qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

spapr: Improved placement of PCI host bridges in guest memory map

Currently, the MMIO space for accessing PCI on pseries guests begins at
1 TiB in guest address space. Each PCI host bridge (PHB) has a 64 GiB
chunk of address space in which it places its outbound PIO and 32-bit and
64-bit MMIO windows.

This scheme has several problems:
- It limits guest RAM to 1 TiB (though we have a limited fix for this
  now)
- It limits the total MMIO window to 64 GiB. This is not always enough
  for some of the large nVidia GPGPU cards
- Putting all the windows into a single 64 GiB area means that naturally
  aligning things within there will waste more address space.

In addition there was a miscalculation in some of the defaults, which meant
that the MMIO windows for each PHB actually slightly overran the 64 GiB
region for that PHB. We got away without nasty consequences because
the overrun fit within an unused area at the beginning of the next PHB's
region, but it's not pretty.

This patch implements a new scheme which addresses those problems, and is
also closer to what bare metal hardware and pHyp guests generally use.

Because some guest versions (including most current distro kernels) can't
access PCI MMIO above 64 TiB, we put all the PCI windows between 32 TiB and
64 TiB. This is broken into 1 TiB chunks. The first 1 TiB contains the
PIO (64 kiB) and 32-bit MMIO (2 GiB) windows for all of the PHBs. Each
subsequent TiB chunk contains a naturally aligned 64-bit MMIO window for
one PHB.

This reduces the number of allowed PHBs (without full manual configuration
of all the windows) from 256 to 31, but this should still be plenty in
practice.

We also change some of the default window sizes for manually configured
PHBs to saner values.

Finally we adjust some tests and libqos so that they correctly use the new
default locations. Ideally libqos would parse the device tree given to the
guest, but that's a more complex problem for another time.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>

+109 -40
+92 -30
hw/ppc/spapr.c
··· 2375 2375 hwaddr *mmio32, hwaddr *mmio64, 2376 2376 unsigned n_dma, uint32_t *liobns, Error **errp) 2377 2377 { 2378 + /* 2379 + * New-style PHB window placement. 2380 + * 2381 + * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window 2382 + * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO 2383 + * windows. 2384 + * 2385 + * Some guest kernels can't work with MMIO windows above 1<<46 2386 + * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB 2387 + * 2388 + * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each 2389 + * PHB stacked together. (32TiB+2GiB)..(32TiB+64GiB) contains the 2390 + * 2GiB 32-bit MMIO windows for each PHB. Then 33..64TiB has the 2391 + * 1TiB 64-bit MMIO windows for each PHB. 2392 + */ 2378 2393 const uint64_t base_buid = 0x800000020000000ULL; 2379 - const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */ 2380 - const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */ 2381 - const hwaddr pio_offset = 0x80000000; /* 2 GiB */ 2382 - const uint32_t max_index = 255; 2383 - const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */ 2384 - 2385 - uint64_t ram_top = MACHINE(spapr)->ram_size; 2386 - hwaddr phb0_base, phb_base; 2394 + const int max_phbs = 2395 + (SPAPR_PCI_LIMIT - SPAPR_PCI_BASE) / SPAPR_PCI_MEM64_WIN_SIZE - 1; 2387 2396 int i; 2388 2397 2389 - /* Do we have hotpluggable memory? 
*/ 2390 - if (MACHINE(spapr)->maxram_size > ram_top) { 2391 - /* Can't just use maxram_size, because there may be an 2392 - * alignment gap between normal and hotpluggable memory 2393 - * regions */ 2394 - ram_top = spapr->hotplug_memory.base + 2395 - memory_region_size(&spapr->hotplug_memory.mr); 2396 - } 2398 + /* Sanity check natural alignments */ 2399 + QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0); 2400 + QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0); 2401 + QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0); 2402 + QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0); 2403 + /* Sanity check bounds */ 2404 + QEMU_BUILD_BUG_ON((max_phbs * SPAPR_PCI_IO_WIN_SIZE) > SPAPR_PCI_MEM32_WIN_SIZE); 2405 + QEMU_BUILD_BUG_ON((max_phbs * SPAPR_PCI_MEM32_WIN_SIZE) > SPAPR_PCI_MEM64_WIN_SIZE); 2397 2406 2398 - phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment); 2399 - 2400 - if (index > max_index) { 2407 + if (index >= max_phbs) { 2401 2408 error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)", 2402 - max_index); 2409 + max_phbs - 1); 2403 2410 return; 2404 2411 } 2405 2412 ··· 2408 2415 liobns[i] = SPAPR_PCI_LIOBN(index, i); 2409 2416 } 2410 2417 2411 - phb_base = phb0_base + index * phb_spacing; 2412 - *pio = phb_base + pio_offset; 2413 - *mmio32 = phb_base + mmio_offset; 2414 - /* 2415 - * We don't set the 64-bit MMIO window, relying on the PHB's 2416 - * fallback behaviour of automatically splitting a large "32-bit" 2417 - * window into contiguous 32-bit and 64-bit windows 2418 - */ 2418 + *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE; 2419 + *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE; 2420 + *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE; 2419 2421 } 2420 2422 2421 2423 static void spapr_machine_class_init(ObjectClass *oc, void *data) ··· 2519 2521 /* 2520 2522 * pseries-2.7 2521 2523 */ 2522 - #define SPAPR_COMPAT_2_7 
\ 2523 - HW_COMPAT_2_7 \ 2524 + #define SPAPR_COMPAT_2_7 \ 2525 + HW_COMPAT_2_7 \ 2526 + { \ 2527 + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE, \ 2528 + .property = "mem_win_size", \ 2529 + .value = stringify(SPAPR_PCI_2_7_MMIO_WIN_SIZE),\ 2530 + }, \ 2531 + { \ 2532 + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE, \ 2533 + .property = "mem64_win_size", \ 2534 + .value = "0", \ 2535 + }, 2536 + 2537 + static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index, 2538 + uint64_t *buid, hwaddr *pio, 2539 + hwaddr *mmio32, hwaddr *mmio64, 2540 + unsigned n_dma, uint32_t *liobns, Error **errp) 2541 + { 2542 + /* Legacy PHB placement for pseries-2.7 and earlier machine types */ 2543 + const uint64_t base_buid = 0x800000020000000ULL; 2544 + const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */ 2545 + const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */ 2546 + const hwaddr pio_offset = 0x80000000; /* 2 GiB */ 2547 + const uint32_t max_index = 255; 2548 + const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */ 2549 + 2550 + uint64_t ram_top = MACHINE(spapr)->ram_size; 2551 + hwaddr phb0_base, phb_base; 2552 + int i; 2553 + 2554 + /* Do we have hotpluggable memory? 
*/ 2555 + if (MACHINE(spapr)->maxram_size > ram_top) { 2556 + /* Can't just use maxram_size, because there may be an 2557 + * alignment gap between normal and hotpluggable memory 2558 + * regions */ 2559 + ram_top = spapr->hotplug_memory.base + 2560 + memory_region_size(&spapr->hotplug_memory.mr); 2561 + } 2562 + 2563 + phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment); 2564 + 2565 + if (index > max_index) { 2566 + error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)", 2567 + max_index); 2568 + return; 2569 + } 2570 + 2571 + *buid = base_buid + index; 2572 + for (i = 0; i < n_dma; ++i) { 2573 + liobns[i] = SPAPR_PCI_LIOBN(index, i); 2574 + } 2575 + 2576 + phb_base = phb0_base + index * phb_spacing; 2577 + *pio = phb_base + pio_offset; 2578 + *mmio32 = phb_base + mmio_offset; 2579 + /* 2580 + * We don't set the 64-bit MMIO window, relying on the PHB's 2581 + * fallback behaviour of automatically splitting a large "32-bit" 2582 + * window into contiguous 32-bit and 64-bit windows 2583 + */ 2584 + } 2524 2585 2525 2586 static void spapr_machine_2_7_instance_options(MachineState *machine) 2526 2587 { ··· 2534 2595 spapr_machine_2_8_class_options(mc); 2535 2596 smc->tcg_default_cpu = "POWER7"; 2536 2597 SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_7); 2598 + smc->phb_placement = phb_placement_2_7; 2537 2599 } 2538 2600 2539 2601 DEFINE_SPAPR_MACHINE(2_7, "2.7", false);
+3 -2
hw/ppc/spapr_pci.c
··· 1564 1564 DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1), 1565 1565 DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1), 1566 1566 DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size, 1567 - SPAPR_PCI_MMIO_WIN_SIZE), 1567 + SPAPR_PCI_MEM32_WIN_SIZE), 1568 1568 DEFINE_PROP_UINT64("mem64_win_addr", sPAPRPHBState, mem64_win_addr, -1), 1569 - DEFINE_PROP_UINT64("mem64_win_size", sPAPRPHBState, mem64_win_size, 0), 1569 + DEFINE_PROP_UINT64("mem64_win_size", sPAPRPHBState, mem64_win_size, 1570 + SPAPR_PCI_MEM64_WIN_SIZE), 1570 1571 DEFINE_PROP_UINT64("mem64_win_pciaddr", sPAPRPHBState, mem64_win_pciaddr, 1571 1572 -1), 1572 1573 DEFINE_PROP_UINT64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
+7 -1
include/hw/pci-host/spapr.h
··· 84 84 #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL 85 85 #define SPAPR_PCI_MEM32_WIN_SIZE \ 86 86 ((1ULL << 32) - SPAPR_PCI_MEM_WIN_BUS_OFFSET) 87 + #define SPAPR_PCI_MEM64_WIN_SIZE 0x10000000000ULL /* 1 TiB */ 87 88 88 - #define SPAPR_PCI_MMIO_WIN_SIZE 0xf80000000 89 + /* Without manual configuration, all PCI outbound windows will be 90 + * within this range */ 91 + #define SPAPR_PCI_BASE (1ULL << 45) /* 32 TiB */ 92 + #define SPAPR_PCI_LIMIT (1ULL << 46) /* 64 TiB */ 93 + 94 + #define SPAPR_PCI_2_7_MMIO_WIN_SIZE 0xf80000000 89 95 #define SPAPR_PCI_IO_WIN_SIZE 0x10000 90 96 91 97 #define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL
+2 -1
tests/endianness-test.c
··· 38 38 { "ppc", "prep", 0x80000000, .bswap = true }, 39 39 { "ppc", "bamboo", 0xe8000000, .bswap = true, .superio = "i82378" }, 40 40 { "ppc64", "mac99", 0xf2000000, .bswap = true, .superio = "i82378" }, 41 - { "ppc64", "pseries", 0x10080000000ULL, 41 + { "ppc64", "pseries", (1ULL << 45), .bswap = true, .superio = "i82378" }, 42 + { "ppc64", "pseries-2.7", 0x10080000000ULL, 42 43 .bswap = true, .superio = "i82378" }, 43 44 { "sh4", "r2d", 0xfe240000, .superio = "i82378" }, 44 45 { "sh4eb", "r2d", 0xfe240000, .bswap = true, .superio = "i82378" },
+4 -5
tests/libqos/pci-spapr.c
··· 235 235 /* FIXME */ 236 236 } 237 237 238 - #define SPAPR_PCI_WINDOW_BASE 0x10000000000ULL 239 - #define SPAPR_PCI_MMIO32_WIN_OFF 0xA0000000 238 + #define SPAPR_PCI_BASE (1ULL << 45) 239 + 240 240 #define SPAPR_PCI_MMIO32_WIN_SIZE 0x80000000 /* 2 GiB */ 241 - #define SPAPR_PCI_IO_WIN_OFF 0x80000000 242 241 #define SPAPR_PCI_IO_WIN_SIZE 0x10000 243 242 244 243 QPCIBus *qpci_init_spapr(QGuestAllocator *alloc) ··· 273 272 * get the window locations */ 274 273 ret->buid = 0x800000020000000ULL; 275 274 276 - ret->pio_cpu_base = SPAPR_PCI_WINDOW_BASE + SPAPR_PCI_IO_WIN_OFF; 275 + ret->pio_cpu_base = SPAPR_PCI_BASE; 277 276 ret->pio.pci_base = 0; 278 277 ret->pio.size = SPAPR_PCI_IO_WIN_SIZE; 279 278 280 279 /* 32-bit portion of the MMIO window is at PCI address 2..4 GiB */ 281 - ret->mmio32_cpu_base = SPAPR_PCI_WINDOW_BASE + SPAPR_PCI_MMIO32_WIN_OFF; 280 + ret->mmio32_cpu_base = SPAPR_PCI_BASE + SPAPR_PCI_MMIO32_WIN_SIZE; 282 281 ret->mmio32.pci_base = 0x80000000; /* 2 GiB */ 283 282 ret->mmio32.size = SPAPR_PCI_MMIO32_WIN_SIZE; 284 283
+1 -1
tests/spapr-phb-test.c
··· 25 25 g_test_init(&argc, &argv, NULL); 26 26 qtest_add_func("/spapr-phb/device", test_phb_device); 27 27 28 - qtest_start("-device " TYPE_SPAPR_PCI_HOST_BRIDGE ",index=100"); 28 + qtest_start("-device " TYPE_SPAPR_PCI_HOST_BRIDGE ",index=30"); 29 29 30 30 ret = g_test_run(); 31 31