QEMU (with HAX) patched to log DMA reads & writes — see jcs.org/2018/11/12/vfio

tcg: define CF_PARALLEL and use it for TB hashing along with CF_COUNT_MASK

This will enable us to decouple code translation from the value
of parallel_cpus at any given time. It will also help us minimize
TB flushes when generating code via EXCP_ATOMIC.

Note that the declaration of parallel_cpus is brought to exec-all.h
to be able to define there the "curr_cflags" inline.

Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Authored by Emilio G. Cota and committed by Richard Henderson.
4e2ca83e e89b28a6

+65 -39
+23 -22
accel/tcg/cpu-exec.c
··· 207 207 tb_lock(); 208 208 tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags, 209 209 max_cycles | CF_NOCACHE 210 - | (ignore_icount ? CF_IGNORE_ICOUNT : 0)); 210 + | (ignore_icount ? CF_IGNORE_ICOUNT : 0) 211 + | curr_cflags()); 211 212 tb->orig_tb = orig_tb; 212 213 tb_unlock(); 213 214 ··· 225 226 static void cpu_exec_step(CPUState *cpu) 226 227 { 227 228 CPUClass *cc = CPU_GET_CLASS(cpu); 228 - CPUArchState *env = (CPUArchState *)cpu->env_ptr; 229 229 TranslationBlock *tb; 230 230 target_ulong cs_base, pc; 231 231 uint32_t flags; 232 + uint32_t cflags = 1 | CF_IGNORE_ICOUNT; 232 233 233 - cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); 234 234 if (sigsetjmp(cpu->jmp_env, 0) == 0) { 235 - mmap_lock(); 236 - tb_lock(); 237 - tb = tb_gen_code(cpu, pc, cs_base, flags, 238 - 1 | CF_NOCACHE | CF_IGNORE_ICOUNT); 239 - tb->orig_tb = NULL; 240 - tb_unlock(); 241 - mmap_unlock(); 235 + tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, 236 + cflags & CF_HASH_MASK); 237 + if (tb == NULL) { 238 + mmap_lock(); 239 + tb_lock(); 240 + tb = tb_gen_code(cpu, pc, cs_base, flags, cflags); 241 + tb_unlock(); 242 + mmap_unlock(); 243 + } 242 244 243 245 cc->cpu_exec_enter(cpu); 244 246 /* execute the generated code */ 245 - trace_exec_tb_nocache(tb, pc); 247 + trace_exec_tb(tb, pc); 246 248 cpu_tb_exec(cpu, tb); 247 249 cc->cpu_exec_exit(cpu); 248 - 249 - tb_lock(); 250 - tb_phys_invalidate(tb, -1); 251 - tb_free(tb); 252 - tb_unlock(); 253 250 } else { 254 251 /* We may have exited due to another problem here, so we need 255 252 * to reset any tb_locks we may have taken but didn't release. 
··· 281 278 CPUArchState *env; 282 279 tb_page_addr_t phys_page1; 283 280 uint32_t flags; 281 + uint32_t cf_mask; 284 282 uint32_t trace_vcpu_dstate; 285 283 }; 286 284 ··· 294 292 tb->cs_base == desc->cs_base && 295 293 tb->flags == desc->flags && 296 294 tb->trace_vcpu_dstate == desc->trace_vcpu_dstate && 297 - !(atomic_read(&tb->cflags) & CF_INVALID)) { 295 + (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == desc->cf_mask) { 298 296 /* check next page if needed */ 299 297 if (tb->page_addr[1] == -1) { 300 298 return true; ··· 313 311 } 314 312 315 313 TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, 316 - target_ulong cs_base, uint32_t flags) 314 + target_ulong cs_base, uint32_t flags, 315 + uint32_t cf_mask) 317 316 { 318 317 tb_page_addr_t phys_pc; 319 318 struct tb_desc desc; ··· 322 321 desc.env = (CPUArchState *)cpu->env_ptr; 323 322 desc.cs_base = cs_base; 324 323 desc.flags = flags; 324 + desc.cf_mask = cf_mask; 325 325 desc.trace_vcpu_dstate = *cpu->trace_dstate; 326 326 desc.pc = pc; 327 327 phys_pc = get_page_addr_code(desc.env, pc); 328 328 desc.phys_page1 = phys_pc & TARGET_PAGE_MASK; 329 - h = tb_hash_func(phys_pc, pc, flags, *cpu->trace_dstate); 329 + h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate); 330 330 return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h); 331 331 } 332 332 ··· 373 373 target_ulong cs_base, pc; 374 374 uint32_t flags; 375 375 bool acquired_tb_lock = false; 376 + uint32_t cf_mask = curr_cflags(); 376 377 377 - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags); 378 + tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask); 378 379 if (tb == NULL) { 379 380 /* mmap_lock is needed by tb_gen_code, and mmap_lock must be 380 381 * taken outside tb_lock. As system emulation is currently ··· 387 388 /* There's a chance that our desired tb has been translated while 388 389 * taking the locks so we check again inside the lock. 
389 390 */ 390 - tb = tb_htable_lookup(cpu, pc, cs_base, flags); 391 + tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask); 391 392 if (likely(tb == NULL)) { 392 393 /* if no translated code available, then translate it now */ 393 - tb = tb_gen_code(cpu, pc, cs_base, flags, 0); 394 + tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask); 394 395 } 395 396 396 397 mmap_unlock();
+1 -1
accel/tcg/tcg-runtime.c
··· 151 151 target_ulong cs_base, pc; 152 152 uint32_t flags; 153 153 154 - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags); 154 + tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags()); 155 155 if (tb == NULL) { 156 156 return tcg_ctx.code_gen_epilogue; 157 157 }
+9 -4
accel/tcg/translate-all.c
··· 1101 1101 1102 1102 /* remove the TB from the hash list */ 1103 1103 phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); 1104 - h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate); 1104 + h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK, 1105 + tb->trace_vcpu_dstate); 1105 1106 qht_remove(&tcg_ctx.tb_ctx.htable, tb, h); 1106 1107 1107 1108 /* remove the TB from the page list */ ··· 1245 1246 } 1246 1247 1247 1248 /* add in the hash table */ 1248 - h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate); 1249 + h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK, 1250 + tb->trace_vcpu_dstate); 1249 1251 qht_insert(&tcg_ctx.tb_ctx.htable, tb, h); 1250 1252 1251 1253 #ifdef CONFIG_USER_ONLY ··· 1548 1550 /* we generate a block containing just the instruction 1549 1551 modifying the memory. It will ensure that it cannot modify 1550 1552 itself */ 1551 - tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1); 1553 + tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1554 + 1 | curr_cflags()); 1552 1555 cpu_loop_exit_noexc(cpu); 1553 1556 } 1554 1557 #endif ··· 1666 1669 /* we generate a block containing just the instruction 1667 1670 modifying the memory. It will ensure that it cannot modify 1668 1671 itself */ 1669 - tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1); 1672 + tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1673 + 1 | curr_cflags()); 1670 1674 /* tb_lock will be reset after cpu_loop_exit_noexc longjmps 1671 1675 * back into the cpu_exec loop. */ 1672 1676 return true; ··· 1810 1814 } 1811 1815 1812 1816 cflags = n | CF_LAST_IO; 1817 + cflags |= curr_cflags(); 1813 1818 pc = tb->pc; 1814 1819 cs_base = tb->cs_base; 1815 1820 flags = tb->flags;
+1 -1
exec.c
··· 2476 2476 cpu_loop_exit(cpu); 2477 2477 } else { 2478 2478 cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags); 2479 - tb_gen_code(cpu, pc, cs_base, cpu_flags, 1); 2479 + tb_gen_code(cpu, pc, cs_base, cpu_flags, 1 | curr_cflags()); 2480 2480 cpu_loop_exit_noexc(cpu); 2481 2481 } 2482 2482 }
+19 -1
include/exec/exec-all.h
··· 325 325 #define CF_USE_ICOUNT 0x20000 326 326 #define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */ 327 327 #define CF_INVALID 0x80000 /* TB is stale. Setters must acquire tb_lock */ 328 + #define CF_PARALLEL 0x100000 /* Generate code for a parallel context */ 329 + /* cflags' mask for hashing/comparison */ 330 + #define CF_HASH_MASK (CF_PARALLEL) 328 331 329 332 /* Per-vCPU dynamic tracing state used to generate this TB */ 330 333 uint32_t trace_vcpu_dstate; ··· 365 368 uintptr_t jmp_list_first; 366 369 }; 367 370 371 + extern bool parallel_cpus; 372 + 373 + /* Hide the atomic_read to make code a little easier on the eyes */ 374 + static inline uint32_t tb_cflags(const TranslationBlock *tb) 375 + { 376 + return atomic_read(&tb->cflags); 377 + } 378 + 379 + /* current cflags for hashing/comparison */ 380 + static inline uint32_t curr_cflags(void) 381 + { 382 + return parallel_cpus ? CF_PARALLEL : 0; 383 + } 384 + 368 385 void tb_free(TranslationBlock *tb); 369 386 void tb_flush(CPUState *cpu); 370 387 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); 371 388 TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, 372 - target_ulong cs_base, uint32_t flags); 389 + target_ulong cs_base, uint32_t flags, 390 + uint32_t cf_mask); 373 391 void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); 374 392 375 393 /* GETPC is the true target of the return instruction that we'll execute. */
+6 -3
include/exec/tb-hash-xx.h
··· 48 48 * xxhash32, customized for input variables that are not guaranteed to be 49 49 * contiguous in memory. 50 50 */ 51 - static inline 52 - uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f) 51 + static inline uint32_t 52 + tb_hash_func7(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f, uint32_t g) 53 53 { 54 54 uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2; 55 55 uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2; ··· 78 78 v4 *= PRIME32_1; 79 79 80 80 h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18); 81 - h32 += 24; 81 + h32 += 28; 82 82 83 83 h32 += e * PRIME32_3; 84 84 h32 = rol32(h32, 17) * PRIME32_4; 85 85 86 86 h32 += f * PRIME32_3; 87 + h32 = rol32(h32, 17) * PRIME32_4; 88 + 89 + h32 += g * PRIME32_3; 87 90 h32 = rol32(h32, 17) * PRIME32_4; 88 91 89 92 h32 ^= h32 >> 15;
+2 -2
include/exec/tb-hash.h
··· 59 59 60 60 static inline 61 61 uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags, 62 - uint32_t trace_vcpu_dstate) 62 + uint32_t cf_mask, uint32_t trace_vcpu_dstate) 63 63 { 64 - return tb_hash_func6(phys_pc, pc, flags, trace_vcpu_dstate); 64 + return tb_hash_func7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate); 65 65 } 66 66 67 67 #endif
+3 -3
include/exec/tb-lookup.h
··· 21 21 /* Might cause an exception, so have a longjmp destination ready */ 22 22 static inline TranslationBlock * 23 23 tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base, 24 - uint32_t *flags) 24 + uint32_t *flags, uint32_t cf_mask) 25 25 { 26 26 CPUArchState *env = (CPUArchState *)cpu->env_ptr; 27 27 TranslationBlock *tb; ··· 35 35 tb->cs_base == *cs_base && 36 36 tb->flags == *flags && 37 37 tb->trace_vcpu_dstate == *cpu->trace_dstate && 38 - !(atomic_read(&tb->cflags) & CF_INVALID))) { 38 + (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) { 39 39 return tb; 40 40 } 41 - tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags); 41 + tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask); 42 42 if (tb == NULL) { 43 43 return NULL; 44 44 }
-1
tcg/tcg.h
··· 690 690 }; 691 691 692 692 extern TCGContext tcg_ctx; 693 - extern bool parallel_cpus; 694 693 695 694 static inline size_t temp_idx(TCGTemp *ts) 696 695 {
+1 -1
tests/qht-bench.c
··· 103 103 104 104 static inline uint32_t h(unsigned long v) 105 105 { 106 - return tb_hash_func6(v, 0, 0, 0); 106 + return tb_hash_func7(v, 0, 0, 0, 0); 107 107 } 108 108 109 109 /*