qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

tests/fp: add fp-bench

These microbenchmarks will allow us to measure the performance impact of
FP emulation optimizations. Note that we can measure either the direct impact
on the softfloat functions (with "-t soft") or the impact on an
emulated workload (call with "-t host" and run under qemu user-mode).

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

authored by

Emilio G. Cota and committed by
Alex Bennée
25f539f3 315df0d1

+635 -1
+1
tests/fp/.gitignore
··· 1 1 fp-test 2 + fp-bench
+4 -1
tests/fp/Makefile
··· 553 553 TF_OBJS_LIB += testLoops_common.o 554 554 TF_OBJS_LIB += $(TF_OBJS_TEST) 555 555 556 - BINARIES := fp-test$(EXESUF) 556 + BINARIES := fp-test$(EXESUF) fp-bench$(EXESUF) 557 557 558 558 # everything depends on config-host.h because platform.h includes it 559 559 all: $(BUILD_DIR)/config-host.h ··· 590 590 591 591 libtestfloat.a: $(TF_OBJS_LIB) 592 592 593 + fp-bench$(EXESUF): fp-bench.o $(QEMU_SOFTFLOAT_OBJ) $(LIBQEMUUTIL) 594 + 593 595 clean: 594 596 rm -f *.o *.d $(BINARIES) 595 597 rm -f *.gcno *.gcda *.gcov 596 598 rm -f fp-test$(EXESUF) 599 + rm -f fp-bench$(EXESUF) 597 600 rm -f libsoftfloat.a 598 601 rm -f libtestfloat.a 599 602
+630
tests/fp/fp-bench.c
··· 1 + /* 2 + * fp-bench.c - A collection of simple floating point microbenchmarks. 3 + * 4 + * Copyright (C) 2018, Emilio G. Cota <cota@braap.org> 5 + * 6 + * License: GNU GPL, version 2 or later. 7 + * See the COPYING file in the top-level directory. 8 + */ 9 + #ifndef HW_POISON_H 10 + #error Must define HW_POISON_H to work around TARGET_* poisoning 11 + #endif 12 + 13 + #include "qemu/osdep.h" 14 + #include <math.h> 15 + #include <fenv.h> 16 + #include "qemu/timer.h" 17 + #include "fpu/softfloat.h" 18 + 19 + /* amortize the computation of random inputs */ 20 + #define OPS_PER_ITER 50000 21 + 22 + #define MAX_OPERANDS 3 23 + 24 + #define SEED_A 0xdeadfacedeadface 25 + #define SEED_B 0xbadc0feebadc0fee 26 + #define SEED_C 0xbeefdeadbeefdead 27 + 28 + enum op { 29 + OP_ADD, 30 + OP_SUB, 31 + OP_MUL, 32 + OP_DIV, 33 + OP_FMA, 34 + OP_SQRT, 35 + OP_CMP, 36 + OP_MAX_NR, 37 + }; 38 + 39 + static const char * const op_names[] = { 40 + [OP_ADD] = "add", 41 + [OP_SUB] = "sub", 42 + [OP_MUL] = "mul", 43 + [OP_DIV] = "div", 44 + [OP_FMA] = "mulAdd", 45 + [OP_SQRT] = "sqrt", 46 + [OP_CMP] = "cmp", 47 + [OP_MAX_NR] = NULL, 48 + }; 49 + 50 + enum precision { 51 + PREC_SINGLE, 52 + PREC_DOUBLE, 53 + PREC_FLOAT32, 54 + PREC_FLOAT64, 55 + PREC_MAX_NR, 56 + }; 57 + 58 + enum rounding { 59 + ROUND_EVEN, 60 + ROUND_ZERO, 61 + ROUND_DOWN, 62 + ROUND_UP, 63 + ROUND_TIEAWAY, 64 + N_ROUND_MODES, 65 + }; 66 + 67 + static const char * const round_names[] = { 68 + [ROUND_EVEN] = "even", 69 + [ROUND_ZERO] = "zero", 70 + [ROUND_DOWN] = "down", 71 + [ROUND_UP] = "up", 72 + [ROUND_TIEAWAY] = "tieaway", 73 + }; 74 + 75 + enum tester { 76 + TESTER_SOFT, 77 + TESTER_HOST, 78 + TESTER_MAX_NR, 79 + }; 80 + 81 + static const char * const tester_names[] = { 82 + [TESTER_SOFT] = "soft", 83 + [TESTER_HOST] = "host", 84 + [TESTER_MAX_NR] = NULL, 85 + }; 86 + 87 + union fp { 88 + float f; 89 + double d; 90 + float32 f32; 91 + float64 f64; 92 + uint64_t u64; 93 + }; 94 + 95 + struct op_state; 96 + 97 + 
typedef float (*float_func_t)(const struct op_state *s); 98 + typedef double (*double_func_t)(const struct op_state *s); 99 + 100 + union fp_func { 101 + float_func_t float_func; 102 + double_func_t double_func; 103 + }; 104 + 105 + typedef void (*bench_func_t)(void); 106 + 107 + struct op_desc { 108 + const char * const name; 109 + }; 110 + 111 + #define DEFAULT_DURATION_SECS 1 112 + 113 + static uint64_t random_ops[MAX_OPERANDS] = { 114 + SEED_A, SEED_B, SEED_C, 115 + }; 116 + static float_status soft_status; 117 + static enum precision precision; 118 + static enum op operation; 119 + static enum tester tester; 120 + static uint64_t n_completed_ops; 121 + static unsigned int duration = DEFAULT_DURATION_SECS; 122 + static int64_t ns_elapsed; 123 + /* disable optimizations with volatile */ 124 + static volatile union fp res; 125 + 126 + /* 127 + * From: https://en.wikipedia.org/wiki/Xorshift 128 + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only 129 + * guaranteed to be >= INT_MAX). 
130 + */ 131 + static uint64_t xorshift64star(uint64_t x) 132 + { 133 + x ^= x >> 12; /* a */ 134 + x ^= x << 25; /* b */ 135 + x ^= x >> 27; /* c */ 136 + return x * UINT64_C(2685821657736338717); 137 + } 138 + 139 + static void update_random_ops(int n_ops, enum precision prec) 140 + { 141 + int i; 142 + 143 + for (i = 0; i < n_ops; i++) { 144 + uint64_t r = random_ops[i]; 145 + 146 + if (prec == PREC_SINGLE || PREC_FLOAT32) { 147 + do { 148 + r = xorshift64star(r); 149 + } while (!float32_is_normal(r)); 150 + } else if (prec == PREC_DOUBLE || PREC_FLOAT64) { 151 + do { 152 + r = xorshift64star(r); 153 + } while (!float64_is_normal(r)); 154 + } else { 155 + g_assert_not_reached(); 156 + } 157 + random_ops[i] = r; 158 + } 159 + } 160 + 161 + static void fill_random(union fp *ops, int n_ops, enum precision prec, 162 + bool no_neg) 163 + { 164 + int i; 165 + 166 + for (i = 0; i < n_ops; i++) { 167 + switch (prec) { 168 + case PREC_SINGLE: 169 + case PREC_FLOAT32: 170 + ops[i].f32 = make_float32(random_ops[i]); 171 + if (no_neg && float32_is_neg(ops[i].f32)) { 172 + ops[i].f32 = float32_chs(ops[i].f32); 173 + } 174 + /* raise the exponent to limit the frequency of denormal results */ 175 + ops[i].f32 |= 0x40000000; 176 + break; 177 + case PREC_DOUBLE: 178 + case PREC_FLOAT64: 179 + ops[i].f64 = make_float64(random_ops[i]); 180 + if (no_neg && float64_is_neg(ops[i].f64)) { 181 + ops[i].f64 = float64_chs(ops[i].f64); 182 + } 183 + /* raise the exponent to limit the frequency of denormal results */ 184 + ops[i].f64 |= LIT64(0x4000000000000000); 185 + break; 186 + default: 187 + g_assert_not_reached(); 188 + } 189 + } 190 + } 191 + 192 + /* 193 + * The main benchmark function. Instead of (ab)using macros, we rely 194 + * on the compiler to unfold this at compile-time. 
195 + */ 196 + static void bench(enum precision prec, enum op op, int n_ops, bool no_neg) 197 + { 198 + int64_t tf = get_clock() + duration * 1000000000LL; 199 + 200 + while (get_clock() < tf) { 201 + union fp ops[MAX_OPERANDS]; 202 + int64_t t0; 203 + int i; 204 + 205 + update_random_ops(n_ops, prec); 206 + switch (prec) { 207 + case PREC_SINGLE: 208 + fill_random(ops, n_ops, prec, no_neg); 209 + t0 = get_clock(); 210 + for (i = 0; i < OPS_PER_ITER; i++) { 211 + float a = ops[0].f; 212 + float b = ops[1].f; 213 + float c = ops[2].f; 214 + 215 + switch (op) { 216 + case OP_ADD: 217 + res.f = a + b; 218 + break; 219 + case OP_SUB: 220 + res.f = a - b; 221 + break; 222 + case OP_MUL: 223 + res.f = a * b; 224 + break; 225 + case OP_DIV: 226 + res.f = a / b; 227 + break; 228 + case OP_FMA: 229 + res.f = fmaf(a, b, c); 230 + break; 231 + case OP_SQRT: 232 + res.f = sqrtf(a); 233 + break; 234 + case OP_CMP: 235 + res.u64 = isgreater(a, b); 236 + break; 237 + default: 238 + g_assert_not_reached(); 239 + } 240 + } 241 + break; 242 + case PREC_DOUBLE: 243 + fill_random(ops, n_ops, prec, no_neg); 244 + t0 = get_clock(); 245 + for (i = 0; i < OPS_PER_ITER; i++) { 246 + double a = ops[0].d; 247 + double b = ops[1].d; 248 + double c = ops[2].d; 249 + 250 + switch (op) { 251 + case OP_ADD: 252 + res.d = a + b; 253 + break; 254 + case OP_SUB: 255 + res.d = a - b; 256 + break; 257 + case OP_MUL: 258 + res.d = a * b; 259 + break; 260 + case OP_DIV: 261 + res.d = a / b; 262 + break; 263 + case OP_FMA: 264 + res.d = fma(a, b, c); 265 + break; 266 + case OP_SQRT: 267 + res.d = sqrt(a); 268 + break; 269 + case OP_CMP: 270 + res.u64 = isgreater(a, b); 271 + break; 272 + default: 273 + g_assert_not_reached(); 274 + } 275 + } 276 + break; 277 + case PREC_FLOAT32: 278 + fill_random(ops, n_ops, prec, no_neg); 279 + t0 = get_clock(); 280 + for (i = 0; i < OPS_PER_ITER; i++) { 281 + float32 a = ops[0].f32; 282 + float32 b = ops[1].f32; 283 + float32 c = ops[2].f32; 284 + 285 + switch (op) { 
286 + case OP_ADD: 287 + res.f32 = float32_add(a, b, &soft_status); 288 + break; 289 + case OP_SUB: 290 + res.f32 = float32_sub(a, b, &soft_status); 291 + break; 292 + case OP_MUL: 293 + res.f = float32_mul(a, b, &soft_status); 294 + break; 295 + case OP_DIV: 296 + res.f32 = float32_div(a, b, &soft_status); 297 + break; 298 + case OP_FMA: 299 + res.f32 = float32_muladd(a, b, c, 0, &soft_status); 300 + break; 301 + case OP_SQRT: 302 + res.f32 = float32_sqrt(a, &soft_status); 303 + break; 304 + case OP_CMP: 305 + res.u64 = float32_compare_quiet(a, b, &soft_status); 306 + break; 307 + default: 308 + g_assert_not_reached(); 309 + } 310 + } 311 + break; 312 + case PREC_FLOAT64: 313 + fill_random(ops, n_ops, prec, no_neg); 314 + t0 = get_clock(); 315 + for (i = 0; i < OPS_PER_ITER; i++) { 316 + float64 a = ops[0].f64; 317 + float64 b = ops[1].f64; 318 + float64 c = ops[2].f64; 319 + 320 + switch (op) { 321 + case OP_ADD: 322 + res.f64 = float64_add(a, b, &soft_status); 323 + break; 324 + case OP_SUB: 325 + res.f64 = float64_sub(a, b, &soft_status); 326 + break; 327 + case OP_MUL: 328 + res.f = float64_mul(a, b, &soft_status); 329 + break; 330 + case OP_DIV: 331 + res.f64 = float64_div(a, b, &soft_status); 332 + break; 333 + case OP_FMA: 334 + res.f64 = float64_muladd(a, b, c, 0, &soft_status); 335 + break; 336 + case OP_SQRT: 337 + res.f64 = float64_sqrt(a, &soft_status); 338 + break; 339 + case OP_CMP: 340 + res.u64 = float64_compare_quiet(a, b, &soft_status); 341 + break; 342 + default: 343 + g_assert_not_reached(); 344 + } 345 + } 346 + break; 347 + default: 348 + g_assert_not_reached(); 349 + } 350 + ns_elapsed += get_clock() - t0; 351 + n_completed_ops += OPS_PER_ITER; 352 + } 353 + } 354 + 355 + #define GEN_BENCH(name, type, prec, op, n_ops) \ 356 + static void __attribute__((flatten)) name(void) \ 357 + { \ 358 + bench(prec, op, n_ops, false); \ 359 + } 360 + 361 + #define GEN_BENCH_NO_NEG(name, type, prec, op, n_ops) \ 362 + static void __attribute__((flatten)) 
name(void) \ 363 + { \ 364 + bench(prec, op, n_ops, true); \ 365 + } 366 + 367 + #define GEN_BENCH_ALL_TYPES(opname, op, n_ops) \ 368 + GEN_BENCH(bench_ ## opname ## _float, float, PREC_SINGLE, op, n_ops) \ 369 + GEN_BENCH(bench_ ## opname ## _double, double, PREC_DOUBLE, op, n_ops) \ 370 + GEN_BENCH(bench_ ## opname ## _float32, float32, PREC_FLOAT32, op, n_ops) \ 371 + GEN_BENCH(bench_ ## opname ## _float64, float64, PREC_FLOAT64, op, n_ops) 372 + 373 + GEN_BENCH_ALL_TYPES(add, OP_ADD, 2) 374 + GEN_BENCH_ALL_TYPES(sub, OP_SUB, 2) 375 + GEN_BENCH_ALL_TYPES(mul, OP_MUL, 2) 376 + GEN_BENCH_ALL_TYPES(div, OP_DIV, 2) 377 + GEN_BENCH_ALL_TYPES(fma, OP_FMA, 3) 378 + GEN_BENCH_ALL_TYPES(cmp, OP_CMP, 2) 379 + #undef GEN_BENCH_ALL_TYPES 380 + 381 + #define GEN_BENCH_ALL_TYPES_NO_NEG(name, op, n) \ 382 + GEN_BENCH_NO_NEG(bench_ ## name ## _float, float, PREC_SINGLE, op, n) \ 383 + GEN_BENCH_NO_NEG(bench_ ## name ## _double, double, PREC_DOUBLE, op, n) \ 384 + GEN_BENCH_NO_NEG(bench_ ## name ## _float32, float32, PREC_FLOAT32, op, n) \ 385 + GEN_BENCH_NO_NEG(bench_ ## name ## _float64, float64, PREC_FLOAT64, op, n) 386 + 387 + GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1) 388 + #undef GEN_BENCH_ALL_TYPES_NO_NEG 389 + 390 + #undef GEN_BENCH_NO_NEG 391 + #undef GEN_BENCH 392 + 393 + #define GEN_BENCH_FUNCS(opname, op) \ 394 + [op] = { \ 395 + [PREC_SINGLE] = bench_ ## opname ## _float, \ 396 + [PREC_DOUBLE] = bench_ ## opname ## _double, \ 397 + [PREC_FLOAT32] = bench_ ## opname ## _float32, \ 398 + [PREC_FLOAT64] = bench_ ## opname ## _float64, \ 399 + } 400 + 401 + static const bench_func_t bench_funcs[OP_MAX_NR][PREC_MAX_NR] = { 402 + GEN_BENCH_FUNCS(add, OP_ADD), 403 + GEN_BENCH_FUNCS(sub, OP_SUB), 404 + GEN_BENCH_FUNCS(mul, OP_MUL), 405 + GEN_BENCH_FUNCS(div, OP_DIV), 406 + GEN_BENCH_FUNCS(fma, OP_FMA), 407 + GEN_BENCH_FUNCS(sqrt, OP_SQRT), 408 + GEN_BENCH_FUNCS(cmp, OP_CMP), 409 + }; 410 + 411 + #undef GEN_BENCH_FUNCS 412 + 413 + static void run_bench(void) 414 + { 415 + 
bench_func_t f; 416 + 417 + f = bench_funcs[operation][precision]; 418 + g_assert(f); 419 + f(); 420 + } 421 + 422 + /* @arr must be NULL-terminated */ 423 + static int find_name(const char * const *arr, const char *name) 424 + { 425 + int i; 426 + 427 + for (i = 0; arr[i] != NULL; i++) { 428 + if (strcmp(name, arr[i]) == 0) { 429 + return i; 430 + } 431 + } 432 + return -1; 433 + } 434 + 435 + static void usage_complete(int argc, char *argv[]) 436 + { 437 + gchar *op_list = g_strjoinv(", ", (gchar **)op_names); 438 + gchar *tester_list = g_strjoinv(", ", (gchar **)tester_names); 439 + 440 + fprintf(stderr, "Usage: %s [options]\n", argv[0]); 441 + fprintf(stderr, "options:\n"); 442 + fprintf(stderr, " -d = duration, in seconds. Default: %d\n", 443 + DEFAULT_DURATION_SECS); 444 + fprintf(stderr, " -h = show this help message.\n"); 445 + fprintf(stderr, " -o = floating point operation (%s). Default: %s\n", 446 + op_list, op_names[0]); 447 + fprintf(stderr, " -p = floating point precision (single, double). " 448 + "Default: single\n"); 449 + fprintf(stderr, " -r = rounding mode (even, zero, down, up, tieaway). " 450 + "Default: even\n"); 451 + fprintf(stderr, " -t = tester (%s). Default: %s\n", 452 + tester_list, tester_names[0]); 453 + fprintf(stderr, " -z = flush inputs to zero (soft tester only). " 454 + "Default: disabled\n"); 455 + fprintf(stderr, " -Z = flush output to zero (soft tester only). 
" 456 + "Default: disabled\n"); 457 + 458 + g_free(tester_list); 459 + g_free(op_list); 460 + } 461 + 462 + static int round_name_to_mode(const char *name) 463 + { 464 + int i; 465 + 466 + for (i = 0; i < N_ROUND_MODES; i++) { 467 + if (!strcmp(round_names[i], name)) { 468 + return i; 469 + } 470 + } 471 + return -1; 472 + } 473 + 474 + static void QEMU_NORETURN die_host_rounding(enum rounding rounding) 475 + { 476 + fprintf(stderr, "fatal: '%s' rounding not supported on this host\n", 477 + round_names[rounding]); 478 + exit(EXIT_FAILURE); 479 + } 480 + 481 + static void set_host_precision(enum rounding rounding) 482 + { 483 + int rhost; 484 + 485 + switch (rounding) { 486 + case ROUND_EVEN: 487 + rhost = FE_TONEAREST; 488 + break; 489 + case ROUND_ZERO: 490 + rhost = FE_TOWARDZERO; 491 + break; 492 + case ROUND_DOWN: 493 + rhost = FE_DOWNWARD; 494 + break; 495 + case ROUND_UP: 496 + rhost = FE_UPWARD; 497 + break; 498 + case ROUND_TIEAWAY: 499 + die_host_rounding(rounding); 500 + return; 501 + default: 502 + g_assert_not_reached(); 503 + } 504 + 505 + if (fesetround(rhost)) { 506 + die_host_rounding(rounding); 507 + } 508 + } 509 + 510 + static void set_soft_precision(enum rounding rounding) 511 + { 512 + signed char mode; 513 + 514 + switch (rounding) { 515 + case ROUND_EVEN: 516 + mode = float_round_nearest_even; 517 + break; 518 + case ROUND_ZERO: 519 + mode = float_round_to_zero; 520 + break; 521 + case ROUND_DOWN: 522 + mode = float_round_down; 523 + break; 524 + case ROUND_UP: 525 + mode = float_round_up; 526 + break; 527 + case ROUND_TIEAWAY: 528 + mode = float_round_ties_away; 529 + break; 530 + default: 531 + g_assert_not_reached(); 532 + } 533 + soft_status.float_rounding_mode = mode; 534 + } 535 + 536 + static void parse_args(int argc, char *argv[]) 537 + { 538 + int c; 539 + int val; 540 + int rounding = ROUND_EVEN; 541 + 542 + for (;;) { 543 + c = getopt(argc, argv, "d:ho:p:r:t:zZ"); 544 + if (c < 0) { 545 + break; 546 + } 547 + switch (c) { 548 + 
case 'd': 549 + duration = atoi(optarg); 550 + break; 551 + case 'h': 552 + usage_complete(argc, argv); 553 + exit(EXIT_SUCCESS); 554 + case 'o': 555 + val = find_name(op_names, optarg); 556 + if (val < 0) { 557 + fprintf(stderr, "Unsupported op '%s'\n", optarg); 558 + exit(EXIT_FAILURE); 559 + } 560 + operation = val; 561 + break; 562 + case 'p': 563 + if (!strcmp(optarg, "single")) { 564 + precision = PREC_SINGLE; 565 + } else if (!strcmp(optarg, "double")) { 566 + precision = PREC_DOUBLE; 567 + } else { 568 + fprintf(stderr, "Unsupported precision '%s'\n", optarg); 569 + exit(EXIT_FAILURE); 570 + } 571 + break; 572 + case 'r': 573 + rounding = round_name_to_mode(optarg); 574 + if (rounding < 0) { 575 + fprintf(stderr, "fatal: invalid rounding mode '%s'\n", optarg); 576 + exit(EXIT_FAILURE); 577 + } 578 + break; 579 + case 't': 580 + val = find_name(tester_names, optarg); 581 + if (val < 0) { 582 + fprintf(stderr, "Unsupported tester '%s'\n", optarg); 583 + exit(EXIT_FAILURE); 584 + } 585 + tester = val; 586 + break; 587 + case 'z': 588 + soft_status.flush_inputs_to_zero = 1; 589 + break; 590 + case 'Z': 591 + soft_status.flush_to_zero = 1; 592 + break; 593 + } 594 + } 595 + 596 + /* set precision and rounding mode based on the tester */ 597 + switch (tester) { 598 + case TESTER_HOST: 599 + set_host_precision(rounding); 600 + break; 601 + case TESTER_SOFT: 602 + set_soft_precision(rounding); 603 + switch (precision) { 604 + case PREC_SINGLE: 605 + precision = PREC_FLOAT32; 606 + break; 607 + case PREC_DOUBLE: 608 + precision = PREC_FLOAT64; 609 + break; 610 + default: 611 + g_assert_not_reached(); 612 + } 613 + break; 614 + default: 615 + g_assert_not_reached(); 616 + } 617 + } 618 + 619 + static void pr_stats(void) 620 + { 621 + printf("%.2f MFlops\n", (double)n_completed_ops / ns_elapsed * 1e3); 622 + } 623 + 624 + int main(int argc, char *argv[]) 625 + { 626 + parse_args(argc, argv); 627 + run_bench(); 628 + pr_stats(); 629 + return 0; 630 + }