QEMU with hacks to log DMA reads and writes (see jcs.org/2018/11/12/vfio)

tcg: Implement gvec support for rotate by immediate

No host backend support yet, but the interfaces for rotli
are in place. Canonicalize immediate rotate to the left,
based on a survey of architectures, but provide both left
and right rotate interfaces to the translators.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

+150 -1
+48
accel/tcg/tcg-runtime-gvec.c
··· 716 716 clear_high(d, oprsz, desc); 717 717 } 718 718 719 + void HELPER(gvec_rotl8i)(void *d, void *a, uint32_t desc) 720 + { 721 + intptr_t oprsz = simd_oprsz(desc); 722 + int shift = simd_data(desc); 723 + intptr_t i; 724 + 725 + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 726 + *(uint8_t *)(d + i) = rol8(*(uint8_t *)(a + i), shift); 727 + } 728 + clear_high(d, oprsz, desc); 729 + } 730 + 731 + void HELPER(gvec_rotl16i)(void *d, void *a, uint32_t desc) 732 + { 733 + intptr_t oprsz = simd_oprsz(desc); 734 + int shift = simd_data(desc); 735 + intptr_t i; 736 + 737 + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 738 + *(uint16_t *)(d + i) = rol16(*(uint16_t *)(a + i), shift); 739 + } 740 + clear_high(d, oprsz, desc); 741 + } 742 + 743 + void HELPER(gvec_rotl32i)(void *d, void *a, uint32_t desc) 744 + { 745 + intptr_t oprsz = simd_oprsz(desc); 746 + int shift = simd_data(desc); 747 + intptr_t i; 748 + 749 + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 750 + *(uint32_t *)(d + i) = rol32(*(uint32_t *)(a + i), shift); 751 + } 752 + clear_high(d, oprsz, desc); 753 + } 754 + 755 + void HELPER(gvec_rotl64i)(void *d, void *a, uint32_t desc) 756 + { 757 + intptr_t oprsz = simd_oprsz(desc); 758 + int shift = simd_data(desc); 759 + intptr_t i; 760 + 761 + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 762 + *(uint64_t *)(d + i) = rol64(*(uint64_t *)(a + i), shift); 763 + } 764 + clear_high(d, oprsz, desc); 765 + } 766 + 719 767 void HELPER(gvec_shl8v)(void *d, void *a, void *b, uint32_t desc) 720 768 { 721 769 intptr_t oprsz = simd_oprsz(desc);
+5
accel/tcg/tcg-runtime.h
··· 259 259 DEF_HELPER_FLAGS_3(gvec_sar32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) 260 260 DEF_HELPER_FLAGS_3(gvec_sar64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) 261 261 262 + DEF_HELPER_FLAGS_3(gvec_rotl8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) 263 + DEF_HELPER_FLAGS_3(gvec_rotl16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) 264 + DEF_HELPER_FLAGS_3(gvec_rotl32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) 265 + DEF_HELPER_FLAGS_3(gvec_rotl64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) 266 + 262 267 DEF_HELPER_FLAGS_4(gvec_shl8v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) 263 268 DEF_HELPER_FLAGS_4(gvec_shl16v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) 264 269 DEF_HELPER_FLAGS_4(gvec_shl32v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+6
include/tcg/tcg-op-gvec.h
··· 334 334 int64_t shift, uint32_t oprsz, uint32_t maxsz); 335 335 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 336 336 int64_t shift, uint32_t oprsz, uint32_t maxsz); 337 + void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 338 + int64_t shift, uint32_t oprsz, uint32_t maxsz); 339 + void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 340 + int64_t shift, uint32_t oprsz, uint32_t maxsz); 337 341 338 342 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 339 343 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz); ··· 388 392 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); 389 393 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); 390 394 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); 395 + void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c); 396 + void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c); 391 397 392 398 #endif
+2
include/tcg/tcg-op.h
··· 999 999 void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); 1000 1000 void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); 1001 1001 void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); 1002 + void tcg_gen_rotli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); 1003 + void tcg_gen_rotri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); 1002 1004 1003 1005 void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s); 1004 1006 void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+1
include/tcg/tcg-opc.h
··· 248 248 DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) 249 249 DEF(shri_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) 250 250 DEF(sari_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) 251 + DEF(rotli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_roti_vec)) 251 252 252 253 DEF(shls_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) 253 254 DEF(shrs_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec))
+1
include/tcg/tcg.h
··· 182 182 #define TCG_TARGET_HAS_not_vec 0 183 183 #define TCG_TARGET_HAS_andc_vec 0 184 184 #define TCG_TARGET_HAS_orc_vec 0 185 + #define TCG_TARGET_HAS_roti_vec 0 185 186 #define TCG_TARGET_HAS_shi_vec 0 186 187 #define TCG_TARGET_HAS_shs_vec 0 187 188 #define TCG_TARGET_HAS_shv_vec 0
+2 -1
tcg/README
··· 605 605 606 606 * shri_vec v0, v1, i2 607 607 * sari_vec v0, v1, i2 608 + * rotli_vec v0, v1, i2 608 609 * shrs_vec v0, v1, s2 609 610 * sars_vec v0, v1, s2 610 611 611 - Similarly for logical and arithmetic right shift. 612 + Similarly for logical and arithmetic right shift, and left rotate. 612 613 613 614 * shlv_vec v0, v1, v2 614 615
+1
tcg/aarch64/tcg-target.h
··· 133 133 #define TCG_TARGET_HAS_not_vec 1 134 134 #define TCG_TARGET_HAS_neg_vec 1 135 135 #define TCG_TARGET_HAS_abs_vec 1 136 + #define TCG_TARGET_HAS_roti_vec 0 136 137 #define TCG_TARGET_HAS_shi_vec 1 137 138 #define TCG_TARGET_HAS_shs_vec 0 138 139 #define TCG_TARGET_HAS_shv_vec 1
+1
tcg/i386/tcg-target.h
··· 183 183 #define TCG_TARGET_HAS_not_vec 0 184 184 #define TCG_TARGET_HAS_neg_vec 0 185 185 #define TCG_TARGET_HAS_abs_vec 1 186 + #define TCG_TARGET_HAS_roti_vec 0 186 187 #define TCG_TARGET_HAS_shi_vec 1 187 188 #define TCG_TARGET_HAS_shs_vec 1 188 189 #define TCG_TARGET_HAS_shv_vec have_avx2
+1
tcg/ppc/tcg-target.h
··· 161 161 #define TCG_TARGET_HAS_not_vec 1 162 162 #define TCG_TARGET_HAS_neg_vec have_isa_3_00 163 163 #define TCG_TARGET_HAS_abs_vec 0 164 + #define TCG_TARGET_HAS_roti_vec 0 164 165 #define TCG_TARGET_HAS_shi_vec 0 165 166 #define TCG_TARGET_HAS_shs_vec 0 166 167 #define TCG_TARGET_HAS_shv_vec 1
+68
tcg/tcg-op-gvec.c
··· 2694 2694 } 2695 2695 } 2696 2696 2697 + void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2698 + { 2699 + uint64_t mask = dup_const(MO_8, 0xff << c); 2700 + 2701 + tcg_gen_shli_i64(d, a, c); 2702 + tcg_gen_shri_i64(a, a, 8 - c); 2703 + tcg_gen_andi_i64(d, d, mask); 2704 + tcg_gen_andi_i64(a, a, ~mask); 2705 + tcg_gen_or_i64(d, d, a); 2706 + } 2707 + 2708 + void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2709 + { 2710 + uint64_t mask = dup_const(MO_16, 0xffff << c); 2711 + 2712 + tcg_gen_shli_i64(d, a, c); 2713 + tcg_gen_shri_i64(a, a, 16 - c); 2714 + tcg_gen_andi_i64(d, d, mask); 2715 + tcg_gen_andi_i64(a, a, ~mask); 2716 + tcg_gen_or_i64(d, d, a); 2717 + } 2718 + 2719 + void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 2720 + int64_t shift, uint32_t oprsz, uint32_t maxsz) 2721 + { 2722 + static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 2723 + static const GVecGen2i g[4] = { 2724 + { .fni8 = tcg_gen_vec_rotl8i_i64, 2725 + .fniv = tcg_gen_rotli_vec, 2726 + .fno = gen_helper_gvec_rotl8i, 2727 + .opt_opc = vecop_list, 2728 + .vece = MO_8 }, 2729 + { .fni8 = tcg_gen_vec_rotl16i_i64, 2730 + .fniv = tcg_gen_rotli_vec, 2731 + .fno = gen_helper_gvec_rotl16i, 2732 + .opt_opc = vecop_list, 2733 + .vece = MO_16 }, 2734 + { .fni4 = tcg_gen_rotli_i32, 2735 + .fniv = tcg_gen_rotli_vec, 2736 + .fno = gen_helper_gvec_rotl32i, 2737 + .opt_opc = vecop_list, 2738 + .vece = MO_32 }, 2739 + { .fni8 = tcg_gen_rotli_i64, 2740 + .fniv = tcg_gen_rotli_vec, 2741 + .fno = gen_helper_gvec_rotl64i, 2742 + .opt_opc = vecop_list, 2743 + .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2744 + .vece = MO_64 }, 2745 + }; 2746 + 2747 + tcg_debug_assert(vece <= MO_64); 2748 + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2749 + if (shift == 0) { 2750 + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2751 + } else { 2752 + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2753 + } 2754 + } 2755 + 2756 + void 
tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 2757 + int64_t shift, uint32_t oprsz, uint32_t maxsz) 2758 + { 2759 + tcg_debug_assert(vece <= MO_64); 2760 + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2761 + tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 2762 + oprsz, maxsz); 2763 + } 2764 + 2697 2765 /* 2698 2766 * Specialized generation vector shifts by a non-constant scalar. 2699 2767 */
+12
tcg/tcg-op-vec.c
··· 545 545 do_shifti(INDEX_op_sari_vec, vece, r, a, i); 546 546 } 547 547 548 + void tcg_gen_rotli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) 549 + { 550 + do_shifti(INDEX_op_rotli_vec, vece, r, a, i); 551 + } 552 + 553 + void tcg_gen_rotri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) 554 + { 555 + int bits = 8 << vece; 556 + tcg_debug_assert(i >= 0 && i < bits); 557 + do_shifti(INDEX_op_rotli_vec, vece, r, a, -i & (bits - 1)); 558 + } 559 + 548 560 void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, 549 561 TCGv_vec r, TCGv_vec a, TCGv_vec b) 550 562 {
+2
tcg/tcg.c
··· 1661 1661 case INDEX_op_shrv_vec: 1662 1662 case INDEX_op_sarv_vec: 1663 1663 return have_vec && TCG_TARGET_HAS_shv_vec; 1664 + case INDEX_op_rotli_vec: 1665 + return have_vec && TCG_TARGET_HAS_roti_vec; 1664 1666 case INDEX_op_ssadd_vec: 1665 1667 case INDEX_op_usadd_vec: 1666 1668 case INDEX_op_sssub_vec: