QEMU with hacks to log DMA reads & writes — jcs.org/2018/11/12/vfio

Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging

virtio,acpi: features, fixes, cleanups.

vdpa support
virtio-mem support
a handy script for disassembling acpi tables
misc fixes and cleanups

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Tue 07 Jul 2020 13:00:35 BST
# gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg: issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469

* remotes/mst/tags/for_upstream: (41 commits)
vhost-vdpa: introduce vhost-vdpa net client
vhost-vdpa: introduce vhost-vdpa backend
vhost_net: introduce set_config & get_config
vhost: implement vhost_force_iommu method
vhost: introduce new VhostOps vhost_force_iommu
vhost: implement vhost_vq_get_addr method
vhost: introduce new VhostOps vhost_vq_get_addr
vhost: implement vhost_dev_start method
vhost: introduce new VhostOps vhost_dev_start
vhost: check the existence of vhost_set_iotlb_callback
virtio-pci: implement queue_enabled method
virtio-bus: introduce queue_enabled method
vhost_net: use the function qemu_get_peer
net: introduce qemu_get_peer
MAINTAINERS: add VT-d entry
docs: vhost-user: add Virtio status protocol feature
tests/acpi: remove stale allowed tables
numa: Auto-enable NUMA when any memory devices are possible
virtio-mem: Exclude unplugged memory during migration
virtio-mem: Add trace events
...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

# Conflicts:
# hw/arm/virt.c
# hw/virtio/trace-events

+2639 -169
+18
MAINTAINERS
··· 1792 1792 F: hw/virtio/virtio-crypto-pci.c 1793 1793 F: include/hw/virtio/virtio-crypto.h 1794 1794 1795 + virtio-mem 1796 + M: David Hildenbrand <david@redhat.com> 1797 + S: Supported 1798 + W: https://virtio-mem.gitlab.io/ 1799 + F: hw/virtio/virtio-mem.c 1800 + F: hw/virtio/virtio-mem-pci.h 1801 + F: hw/virtio/virtio-mem-pci.c 1802 + F: include/hw/virtio/virtio-mem.h 1803 + 1795 1804 nvme 1796 1805 M: Keith Busch <kbusch@kernel.org> 1797 1806 L: qemu-block@nongnu.org ··· 2616 2625 F: tests/uefi-test-tools/ 2617 2626 F: .gitlab-ci.d/edk2.yml 2618 2627 F: .gitlab-ci.d/edk2/ 2628 + 2629 + VT-d Emulation 2630 + M: Michael S. Tsirkin <mst@redhat.com> 2631 + M: Peter Xu <peterx@redhat.com> 2632 + R: Jason Wang <jasowang@redhat.com> 2633 + S: Supported 2634 + F: hw/i386/intel_iommu.c 2635 + F: hw/i386/intel_iommu_internal.h 2636 + F: include/hw/i386/intel_iommu.h 2619 2637 2620 2638 Usermode Emulation 2621 2639 ------------------
+2 -2
accel/kvm/kvm-all.c
··· 40 40 #include "trace.h" 41 41 #include "hw/irq.h" 42 42 #include "sysemu/sev.h" 43 - #include "sysemu/balloon.h" 44 43 #include "qapi/visitor.h" 45 44 #include "qapi/qapi-types-common.h" 46 45 #include "qapi/qapi-visit-common.h" ··· 2229 2228 2230 2229 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2231 2230 if (!s->sync_mmu) { 2232 - qemu_balloon_inhibit(true); 2231 + ret = ram_block_discard_disable(true); 2232 + assert(!ret); 2233 2233 } 2234 2234 2235 2235 return 0;
-17
balloon.c
··· 36 36 static QEMUBalloonEvent *balloon_event_fn; 37 37 static QEMUBalloonStatus *balloon_stat_fn; 38 38 static void *balloon_opaque; 39 - static int balloon_inhibit_count; 40 - 41 - bool qemu_balloon_is_inhibited(void) 42 - { 43 - return atomic_read(&balloon_inhibit_count) > 0; 44 - } 45 - 46 - void qemu_balloon_inhibit(bool state) 47 - { 48 - if (state) { 49 - atomic_inc(&balloon_inhibit_count); 50 - } else { 51 - atomic_dec(&balloon_inhibit_count); 52 - } 53 - 54 - assert(atomic_read(&balloon_inhibit_count) >= 0); 55 - } 56 39 57 40 static bool have_balloon(Error **errp) 58 41 {
+21
configure
··· 1575 1575 ;; 1576 1576 --enable-vhost-user) vhost_user="yes" 1577 1577 ;; 1578 + --disable-vhost-vdpa) vhost_vdpa="no" 1579 + ;; 1580 + --enable-vhost-vdpa) vhost_vdpa="yes" 1581 + ;; 1578 1582 --disable-vhost-kernel) vhost_kernel="no" 1579 1583 ;; 1580 1584 --enable-vhost-kernel) vhost_kernel="yes" ··· 1883 1887 vhost-crypto vhost-user-crypto backend support 1884 1888 vhost-kernel vhost kernel backend support 1885 1889 vhost-user vhost-user backend support 1890 + vhost-vdpa vhost-vdpa kernel backend support 1886 1891 spice spice 1887 1892 rbd rados block device (rbd) 1888 1893 libiscsi iscsi support ··· 2394 2399 if test "$vhost_user" = "yes" && test "$mingw32" = "yes"; then 2395 2400 error_exit "vhost-user isn't available on win32" 2396 2401 fi 2402 + test "$vhost_vdpa" = "" && vhost_vdpa=$linux 2403 + if test "$vhost_vdpa" = "yes" && test "$linux" != "yes"; then 2404 + error_exit "vhost-vdpa is only available on Linux" 2405 + fi 2397 2406 test "$vhost_kernel" = "" && vhost_kernel=$linux 2398 2407 if test "$vhost_kernel" = "yes" && test "$linux" != "yes"; then 2399 2408 error_exit "vhost-kernel is only available on Linux" ··· 2421 2430 test "$vhost_user_fs" = "" && vhost_user_fs=$vhost_user 2422 2431 if test "$vhost_user_fs" = "yes" && test "$vhost_user" = "no"; then 2423 2432 error_exit "--enable-vhost-user-fs requires --enable-vhost-user" 2433 + fi 2434 + #vhost-vdpa backends 2435 + test "$vhost_net_vdpa" = "" && vhost_net_vdpa=$vhost_vdpa 2436 + if test "$vhost_net_vdpa" = "yes" && test "$vhost_vdpa" = "no"; then 2437 + error_exit "--enable-vhost-net-vdpa requires --enable-vhost-vdpa" 2424 2438 fi 2425 2439 2426 2440 # OR the vhost-kernel and vhost-user values for simplicity ··· 6947 6961 echo "vhost-vsock support $vhost_vsock" 6948 6962 echo "vhost-user support $vhost_user" 6949 6963 echo "vhost-user-fs support $vhost_user_fs" 6964 + echo "vhost-vdpa support $vhost_vdpa" 6950 6965 echo "Trace backends $trace_backends" 6951 6966 if have_backend "simple"; 
then 6952 6967 echo "Trace output file $trace_file-<pid>" ··· 7454 7469 if test "$vhost_net_user" = "yes" ; then 7455 7470 echo "CONFIG_VHOST_NET_USER=y" >> $config_host_mak 7456 7471 fi 7472 + if test "$vhost_net_vdpa" = "yes" ; then 7473 + echo "CONFIG_VHOST_NET_VDPA=y" >> $config_host_mak 7474 + fi 7457 7475 if test "$vhost_crypto" = "yes" ; then 7458 7476 echo "CONFIG_VHOST_CRYPTO=y" >> $config_host_mak 7459 7477 fi ··· 7468 7486 fi 7469 7487 if test "$vhost_user" = "yes" ; then 7470 7488 echo "CONFIG_VHOST_USER=y" >> $config_host_mak 7489 + fi 7490 + if test "$vhost_vdpa" = "yes" ; then 7491 + echo "CONFIG_VHOST_VDPA=y" >> $config_host_mak 7471 7492 fi 7472 7493 if test "$vhost_user_fs" = "yes" ; then 7473 7494 echo "CONFIG_VHOST_USER_FS=y" >> $config_host_mak
+1
docs/interop/index.rst
··· 20 20 qemu-ga 21 21 vhost-user 22 22 vhost-user-gpu 23 + vhost-vdpa
+24
docs/interop/vhost-user.rst
··· 816 816 #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13 817 817 #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 818 818 #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15 819 + #define VHOST_USER_PROTOCOL_F_STATUS 16 819 820 820 821 Master message types 821 822 -------------------- ··· 1306 1307 been successfully negotiated, along with the 1307 1308 ``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and 1308 1309 update the memory tables of the slave device. 1310 + 1311 + ``VHOST_USER_SET_STATUS`` 1312 + :id: 39 1313 + :equivalent ioctl: VHOST_VDPA_SET_STATUS 1314 + :slave payload: N/A 1315 + :master payload: ``u64`` 1316 + 1317 + When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been 1318 + successfully negotiated, this message is submitted by the master to 1319 + notify the backend with updated device status as defined in the Virtio 1320 + specification. 1321 + 1322 + ``VHOST_USER_GET_STATUS`` 1323 + :id: 40 1324 + :equivalent ioctl: VHOST_VDPA_GET_STATUS 1325 + :slave payload: ``u64`` 1326 + :master payload: N/A 1327 + 1328 + When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been 1329 + successfully negotiated, this message is submitted by the master to 1330 + query the backend for its device status as defined in the Virtio 1331 + specification. 1332 + 1309 1333 1310 1334 Slave message types 1311 1335 -------------------
+17
docs/interop/vhost-vdpa.rst
··· 1 + ===================== 2 + Vhost-vdpa Protocol 3 + ===================== 4 + 5 + Introduction 6 + ============= 7 + A vDPA (virtual data path acceleration) device is a device that uses 8 + a datapath which complies with the virtio specifications with a vendor- 9 + specific control path. vDPA devices can either be physically located on 10 + the hardware or emulated by software. 11 + 12 + This document describes the vDPA support in QEMU. 13 + 14 + The corresponding kernel commit is: 15 + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c8cf31885f69e86be0b5b9e6677a26797365e1d 16 + 17 + TODO: More information will be added later.
+52
exec.c
··· 4115 4115 } 4116 4116 } 4117 4117 4118 + /* 4119 + * If positive, discarding RAM is disabled. If negative, discarding RAM is 4120 + * required to work and cannot be disabled. 4121 + */ 4122 + static int ram_block_discard_disabled; 4123 + 4124 + int ram_block_discard_disable(bool state) 4125 + { 4126 + int old; 4127 + 4128 + if (!state) { 4129 + atomic_dec(&ram_block_discard_disabled); 4130 + return 0; 4131 + } 4132 + 4133 + do { 4134 + old = atomic_read(&ram_block_discard_disabled); 4135 + if (old < 0) { 4136 + return -EBUSY; 4137 + } 4138 + } while (atomic_cmpxchg(&ram_block_discard_disabled, old, old + 1) != old); 4139 + return 0; 4140 + } 4141 + 4142 + int ram_block_discard_require(bool state) 4143 + { 4144 + int old; 4145 + 4146 + if (!state) { 4147 + atomic_inc(&ram_block_discard_disabled); 4148 + return 0; 4149 + } 4150 + 4151 + do { 4152 + old = atomic_read(&ram_block_discard_disabled); 4153 + if (old > 0) { 4154 + return -EBUSY; 4155 + } 4156 + } while (atomic_cmpxchg(&ram_block_discard_disabled, old, old - 1) != old); 4157 + return 0; 4158 + } 4159 + 4160 + bool ram_block_discard_is_disabled(void) 4161 + { 4162 + return atomic_read(&ram_block_discard_disabled) > 0; 4163 + } 4164 + 4165 + bool ram_block_discard_is_required(void) 4166 + { 4167 + return atomic_read(&ram_block_discard_disabled) < 0; 4168 + } 4169 + 4118 4170 #endif
+2
hw/arm/virt.c
··· 2401 2401 hc->unplug = virt_machine_device_unplug_cb; 2402 2402 mc->nvdimm_supported = true; 2403 2403 mc->auto_enable_numa_with_memhp = true; 2404 + mc->auto_enable_numa_with_memdev = true; 2404 2405 mc->default_ram_id = "mach-virt.ram"; 2405 2406 2406 2407 object_class_property_add(oc, "acpi", "OnOffAuto", ··· 2516 2517 compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len); 2517 2518 mc->numa_mem_supported = true; 2518 2519 vmc->acpi_expose_flash = true; 2520 + mc->auto_enable_numa_with_memdev = false; 2519 2521 } 2520 2522 DEFINE_VIRT_MACHINE(5, 0) 2521 2523
+12 -5
hw/core/numa.c
··· 688 688 NodeInfo *numa_info = ms->numa_state->nodes; 689 689 690 690 /* 691 - * If memory hotplug is enabled (slots > 0) but without '-numa' 692 - * options explicitly on CLI, guestes will break. 691 + * If memory hotplug is enabled (slot > 0) or memory devices are enabled 692 + * (ms->maxram_size > ram_size) but without '-numa' options explicitly on 693 + * CLI, guests will break. 693 694 * 694 695 * Windows: won't enable memory hotplug without SRAT table at all 695 696 * ··· 704 705 * assume there is just one node with whole RAM. 705 706 */ 706 707 if (ms->numa_state->num_nodes == 0 && 707 - ((ms->ram_slots > 0 && 708 - mc->auto_enable_numa_with_memhp) || 709 - mc->auto_enable_numa)) { 708 + ((ms->ram_slots && mc->auto_enable_numa_with_memhp) || 709 + (ms->maxram_size > ms->ram_size && mc->auto_enable_numa_with_memdev) || 710 + mc->auto_enable_numa)) { 710 711 NumaNodeOptions node = { }; 711 712 parse_numa_node(ms, &node, &error_abort); 712 713 numa_info[0].node_mem = ram_size; ··· 824 825 MemoryDeviceInfoList *info; 825 826 PCDIMMDeviceInfo *pcdimm_info; 826 827 VirtioPMEMDeviceInfo *vpi; 828 + VirtioMEMDeviceInfo *vmi; 827 829 828 830 for (info = info_list; info; info = info->next) { 829 831 MemoryDeviceInfo *value = info->value; ··· 843 845 /* TODO: once we support numa, assign to right node */ 844 846 node_mem[0].node_mem += vpi->size; 845 847 node_mem[0].node_plugged_mem += vpi->size; 848 + break; 849 + case MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM: 850 + vmi = value->u.virtio_mem.data; 851 + node_mem[vmi->node].node_mem += vmi->size; 852 + node_mem[vmi->node].node_plugged_mem += vmi->size; 846 853 break; 847 854 default: 848 855 g_assert_not_reached();
+1
hw/i386/Kconfig
··· 35 35 select ACPI_PCI 36 36 select ACPI_VMGENID 37 37 select VIRTIO_PMEM_SUPPORTED 38 + select VIRTIO_MEM_SUPPORTED 38 39 39 40 config PC_PCI 40 41 bool
+1
hw/i386/microvm.c
··· 464 464 mc->max_cpus = 288; 465 465 mc->has_hotpluggable_cpus = false; 466 466 mc->auto_enable_numa_with_memhp = false; 467 + mc->auto_enable_numa_with_memdev = false; 467 468 mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; 468 469 mc->nvdimm_supported = false; 469 470 mc->default_ram_id = "microvm.ram";
+38 -28
hw/i386/pc.c
··· 88 88 #include "hw/net/ne2000-isa.h" 89 89 #include "standard-headers/asm-x86/bootparam.h" 90 90 #include "hw/virtio/virtio-pmem-pci.h" 91 + #include "hw/virtio/virtio-mem-pci.h" 91 92 #include "hw/mem/memory-device.h" 92 93 #include "sysemu/replay.h" 93 94 #include "qapi/qmp/qerror.h" ··· 1637 1638 numa_cpu_pre_plug(cpu_slot, dev, errp); 1638 1639 } 1639 1640 1640 - static void pc_virtio_pmem_pci_pre_plug(HotplugHandler *hotplug_dev, 1641 - DeviceState *dev, Error **errp) 1641 + static void pc_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev, 1642 + DeviceState *dev, Error **errp) 1642 1643 { 1643 1644 HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev); 1644 1645 Error *local_err = NULL; 1645 1646 1646 - if (!hotplug_dev2) { 1647 + if (!hotplug_dev2 && dev->hotplugged) { 1647 1648 /* 1648 1649 * Without a bus hotplug handler, we cannot control the plug/unplug 1649 - * order. This should never be the case on x86, however better add 1650 - * a safety net. 1650 + * order. We should never reach this point when hotplugging on x86, 1651 + * however, better add a safety net. 
1651 1652 */ 1652 - error_setg(errp, "virtio-pmem-pci not supported on this bus."); 1653 + error_setg(errp, "hotplug of virtio based memory devices not supported" 1654 + " on this bus."); 1653 1655 return; 1654 1656 } 1655 1657 /* ··· 1658 1660 */ 1659 1661 memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL, 1660 1662 &local_err); 1661 - if (!local_err) { 1663 + if (!local_err && hotplug_dev2) { 1662 1664 hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err); 1663 1665 } 1664 1666 error_propagate(errp, local_err); 1665 1667 } 1666 1668 1667 - static void pc_virtio_pmem_pci_plug(HotplugHandler *hotplug_dev, 1668 - DeviceState *dev, Error **errp) 1669 + static void pc_virtio_md_pci_plug(HotplugHandler *hotplug_dev, 1670 + DeviceState *dev, Error **errp) 1669 1671 { 1670 1672 HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev); 1671 1673 Error *local_err = NULL; ··· 1676 1678 * device bits. 1677 1679 */ 1678 1680 memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); 1679 - hotplug_handler_plug(hotplug_dev2, dev, &local_err); 1680 - if (local_err) { 1681 - memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); 1681 + if (hotplug_dev2) { 1682 + hotplug_handler_plug(hotplug_dev2, dev, &local_err); 1683 + if (local_err) { 1684 + memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev)); 1685 + } 1682 1686 } 1683 1687 error_propagate(errp, local_err); 1684 1688 } 1685 1689 1686 - static void pc_virtio_pmem_pci_unplug_request(HotplugHandler *hotplug_dev, 1687 - DeviceState *dev, Error **errp) 1690 + static void pc_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev, 1691 + DeviceState *dev, Error **errp) 1688 1692 { 1689 - /* We don't support virtio pmem hot unplug */ 1690 - error_setg(errp, "virtio pmem device unplug not supported."); 1693 + /* We don't support hot unplug of virtio based memory devices */ 1694 + error_setg(errp, "virtio based memory devices cannot be unplugged."); 1691 1695 } 1692 1696 1693 - 
static void pc_virtio_pmem_pci_unplug(HotplugHandler *hotplug_dev, 1694 - DeviceState *dev, Error **errp) 1697 + static void pc_virtio_md_pci_unplug(HotplugHandler *hotplug_dev, 1698 + DeviceState *dev, Error **errp) 1695 1699 { 1696 - /* We don't support virtio pmem hot unplug */ 1700 + /* We don't support hot unplug of virtio based memory devices */ 1697 1701 } 1698 1702 1699 1703 static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, ··· 1703 1707 pc_memory_pre_plug(hotplug_dev, dev, errp); 1704 1708 } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { 1705 1709 pc_cpu_pre_plug(hotplug_dev, dev, errp); 1706 - } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) { 1707 - pc_virtio_pmem_pci_pre_plug(hotplug_dev, dev, errp); 1710 + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || 1711 + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { 1712 + pc_virtio_md_pci_pre_plug(hotplug_dev, dev, errp); 1708 1713 } 1709 1714 } 1710 1715 ··· 1715 1720 pc_memory_plug(hotplug_dev, dev, errp); 1716 1721 } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { 1717 1722 pc_cpu_plug(hotplug_dev, dev, errp); 1718 - } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) { 1719 - pc_virtio_pmem_pci_plug(hotplug_dev, dev, errp); 1723 + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || 1724 + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { 1725 + pc_virtio_md_pci_plug(hotplug_dev, dev, errp); 1720 1726 } 1721 1727 } 1722 1728 ··· 1727 1733 pc_memory_unplug_request(hotplug_dev, dev, errp); 1728 1734 } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { 1729 1735 pc_cpu_unplug_request_cb(hotplug_dev, dev, errp); 1730 - } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) { 1731 - pc_virtio_pmem_pci_unplug_request(hotplug_dev, dev, errp); 1736 + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || 1737 + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) 
{ 1738 + pc_virtio_md_pci_unplug_request(hotplug_dev, dev, errp); 1732 1739 } else { 1733 1740 error_setg(errp, "acpi: device unplug request for not supported device" 1734 1741 " type: %s", object_get_typename(OBJECT(dev))); ··· 1742 1749 pc_memory_unplug(hotplug_dev, dev, errp); 1743 1750 } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { 1744 1751 pc_cpu_unplug_cb(hotplug_dev, dev, errp); 1745 - } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) { 1746 - pc_virtio_pmem_pci_unplug(hotplug_dev, dev, errp); 1752 + } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || 1753 + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { 1754 + pc_virtio_md_pci_unplug(hotplug_dev, dev, errp); 1747 1755 } else { 1748 1756 error_setg(errp, "acpi: device unplug for not supported device" 1749 1757 " type: %s", object_get_typename(OBJECT(dev))); ··· 1755 1763 { 1756 1764 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || 1757 1765 object_dynamic_cast(OBJECT(dev), TYPE_CPU) || 1758 - object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) { 1766 + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) || 1767 + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) { 1759 1768 return HOTPLUG_HANDLER(machine); 1760 1769 } 1761 1770 ··· 1966 1975 mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; 1967 1976 mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; 1968 1977 mc->auto_enable_numa_with_memhp = true; 1978 + mc->auto_enable_numa_with_memdev = true; 1969 1979 mc->has_hotpluggable_cpus = true; 1970 1980 mc->default_boot_order = "cad"; 1971 1981 mc->hot_add_cpu = pc_hot_add_cpu;
+1
hw/i386/pc_piix.c
··· 444 444 m->numa_mem_supported = true; 445 445 compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len); 446 446 compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len); 447 + m->auto_enable_numa_with_memdev = false; 447 448 } 448 449 449 450 DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL,
+1
hw/i386/pc_q35.c
··· 372 372 m->numa_mem_supported = true; 373 373 compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len); 374 374 compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len); 375 + m->auto_enable_numa_with_memhp = false; 375 376 } 376 377 377 378 DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL,
+11
hw/net/vhost_net-stub.c
··· 52 52 return features; 53 53 } 54 54 55 + int vhost_net_get_config(struct vhost_net *net, uint8_t *config, 56 + uint32_t config_len) 57 + { 58 + return 0; 59 + } 60 + int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, 61 + uint32_t offset, uint32_t size, uint32_t flags) 62 + { 63 + return 0; 64 + } 65 + 55 66 void vhost_net_ack_features(struct vhost_net *net, uint64_t features) 56 67 { 57 68 }
+32 -12
hw/net/vhost_net.c
··· 17 17 #include "net/net.h" 18 18 #include "net/tap.h" 19 19 #include "net/vhost-user.h" 20 + #include "net/vhost-vdpa.h" 20 21 21 22 #include "standard-headers/linux/vhost_types.h" 22 23 #include "hw/virtio/virtio-net.h" ··· 33 34 #include "hw/virtio/vhost.h" 34 35 #include "hw/virtio/virtio-bus.h" 35 36 36 - struct vhost_net { 37 - struct vhost_dev dev; 38 - struct vhost_virtqueue vqs[2]; 39 - int backend; 40 - NetClientState *nc; 41 - }; 42 37 43 38 /* Features supported by host kernel. */ 44 39 static const int kernel_feature_bits[] = { ··· 96 91 case NET_CLIENT_DRIVER_VHOST_USER: 97 92 feature_bits = user_feature_bits; 98 93 break; 94 + #ifdef CONFIG_VHOST_NET_VDPA 95 + case NET_CLIENT_DRIVER_VHOST_VDPA: 96 + feature_bits = vdpa_feature_bits; 97 + break; 98 + #endif 99 99 default: 100 100 error_report("Feature bits not defined for this type: %d", 101 101 net->nc->info->type); ··· 109 109 { 110 110 return vhost_get_features(&net->dev, vhost_net_get_feature_bits(net), 111 111 features); 112 + } 113 + int vhost_net_get_config(struct vhost_net *net, uint8_t *config, 114 + uint32_t config_len) 115 + { 116 + return vhost_dev_get_config(&net->dev, config, config_len); 117 + } 118 + int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, 119 + uint32_t offset, uint32_t size, uint32_t flags) 120 + { 121 + return vhost_dev_set_config(&net->dev, data, offset, size, flags); 112 122 } 113 123 114 124 void vhost_net_ack_features(struct vhost_net *net, uint64_t features) ··· 306 316 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev))); 307 317 VirtioBusState *vbus = VIRTIO_BUS(qbus); 308 318 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); 319 + struct vhost_net *net; 309 320 int r, e, i; 321 + NetClientState *peer; 310 322 311 323 if (!k->set_guest_notifiers) { 312 324 error_report("binding does not support guest notifiers"); ··· 314 326 } 315 327 316 328 for (i = 0; i < total_queues; i++) { 317 - struct vhost_net *net; 318 329 319 - net = 
get_vhost_net(ncs[i].peer); 330 + peer = qemu_get_peer(ncs, i); 331 + net = get_vhost_net(peer); 320 332 vhost_net_set_vq_index(net, i * 2); 321 333 322 334 /* Suppress the masking guest notifiers on vhost user ··· 335 347 } 336 348 337 349 for (i = 0; i < total_queues; i++) { 338 - r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev); 350 + peer = qemu_get_peer(ncs, i); 351 + r = vhost_net_start_one(get_vhost_net(peer), dev); 339 352 340 353 if (r < 0) { 341 354 goto err_start; 342 355 } 343 356 344 - if (ncs[i].peer->vring_enable) { 357 + if (peer->vring_enable) { 345 358 /* restore vring enable state */ 346 - r = vhost_set_vring_enable(ncs[i].peer, ncs[i].peer->vring_enable); 359 + r = vhost_set_vring_enable(peer, peer->vring_enable); 347 360 348 361 if (r < 0) { 349 362 goto err_start; ··· 355 368 356 369 err_start: 357 370 while (--i >= 0) { 358 - vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev); 371 + peer = qemu_get_peer(ncs , i); 372 + vhost_net_stop_one(get_vhost_net(peer), dev); 359 373 } 360 374 e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false); 361 375 if (e < 0) { ··· 427 441 #ifdef CONFIG_VHOST_NET_USER 428 442 case NET_CLIENT_DRIVER_VHOST_USER: 429 443 vhost_net = vhost_user_get_vhost_net(nc); 444 + assert(vhost_net); 445 + break; 446 + #endif 447 + #ifdef CONFIG_VHOST_NET_VDPA 448 + case NET_CLIENT_DRIVER_VHOST_VDPA: 449 + vhost_net = vhost_vdpa_get_vhost_net(nc); 430 450 assert(vhost_net); 431 451 break; 432 452 #endif
+19
hw/net/virtio-net.c
··· 43 43 #include "monitor/qdev.h" 44 44 #include "hw/pci/pci.h" 45 45 #include "net_rx_pkt.h" 46 + #include "hw/virtio/vhost.h" 46 47 47 48 #define VIRTIO_NET_VM_VERSION 11 48 49 ··· 125 126 VirtIONet *n = VIRTIO_NET(vdev); 126 127 struct virtio_net_config netcfg; 127 128 129 + int ret = 0; 130 + memset(&netcfg, 0 , sizeof(struct virtio_net_config)); 128 131 virtio_stw_p(vdev, &netcfg.status, n->status); 129 132 virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queues); 130 133 virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu); ··· 138 141 virtio_stl_p(vdev, &netcfg.supported_hash_types, 139 142 VIRTIO_NET_RSS_SUPPORTED_HASHES); 140 143 memcpy(config, &netcfg, n->config_size); 144 + 145 + NetClientState *nc = qemu_get_queue(n->nic); 146 + if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { 147 + ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg, 148 + n->config_size); 149 + if (ret != -1) { 150 + memcpy(config, &netcfg, n->config_size); 151 + } 152 + } 141 153 } 142 154 143 155 static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config) ··· 153 165 memcpy(n->mac, netcfg.mac, ETH_ALEN); 154 166 qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac); 155 167 } 168 + 169 + NetClientState *nc = qemu_get_queue(n->nic); 170 + if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { 171 + vhost_net_set_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg, 172 + 0, n->config_size, 173 + VHOST_SET_CONFIG_TYPE_MASTER); 174 + } 156 175 } 157 176 158 177 static bool virtio_net_started(VirtIONet *n, uint8_t status)
+13 -9
hw/s390x/s390-virtio-ccw.c
··· 43 43 #include "hw/qdev-properties.h" 44 44 #include "hw/s390x/tod.h" 45 45 #include "sysemu/sysemu.h" 46 - #include "sysemu/balloon.h" 47 46 #include "hw/s390x/pv.h" 48 47 #include "migration/blocker.h" 49 48 ··· 329 328 ms->pv = false; 330 329 migrate_del_blocker(pv_mig_blocker); 331 330 error_free_or_abort(&pv_mig_blocker); 332 - qemu_balloon_inhibit(false); 331 + ram_block_discard_disable(false); 333 332 } 334 333 335 334 static int s390_machine_protect(S390CcwMachineState *ms) ··· 338 337 int rc; 339 338 340 339 /* 341 - * Ballooning on protected VMs needs support in the guest for 342 - * sharing and unsharing balloon pages. Block ballooning for 343 - * now, until we have a solution to make at least Linux guests 344 - * either support it or fail gracefully. 340 + * Discarding of memory in RAM blocks does not work as expected with 341 + * protected VMs. Sharing and unsharing pages would be required. Disable 342 + * it for now, until until we have a solution to make at least Linux 343 + * guests either support it (e.g., virtio-balloon) or fail gracefully. 345 344 */ 346 - qemu_balloon_inhibit(true); 345 + rc = ram_block_discard_disable(true); 346 + if (rc) { 347 + error_report("protected VMs: cannot disable RAM discard"); 348 + return rc; 349 + } 350 + 347 351 error_setg(&pv_mig_blocker, 348 352 "protected VMs are currently not migrateable."); 349 353 rc = migrate_add_blocker(pv_mig_blocker, &local_err); 350 354 if (rc) { 351 - qemu_balloon_inhibit(false); 355 + ram_block_discard_disable(false); 352 356 error_report_err(local_err); 353 357 error_free_or_abort(&pv_mig_blocker); 354 358 return rc; ··· 357 361 /* Create SE VM */ 358 362 rc = s390_pv_vm_enable(); 359 363 if (rc) { 360 - qemu_balloon_inhibit(false); 364 + ram_block_discard_disable(false); 361 365 migrate_del_blocker(pv_mig_blocker); 362 366 error_free_or_abort(&pv_mig_blocker); 363 367 return rc;
+4 -4
hw/vfio/ap.c
··· 105 105 vapdev->vdev.dev = dev; 106 106 107 107 /* 108 - * vfio-ap devices operate in a way compatible with 109 - * memory ballooning, as no pages are pinned in the host. 108 + * vfio-ap devices operate in a way compatible with discarding of 109 + * memory in RAM blocks, as no pages are pinned in the host. 110 110 * This needs to be set before vfio_get_device() for vfio common to 111 - * handle the balloon inhibitor. 111 + * handle ram_block_discard_disable(). 112 112 */ 113 - vapdev->vdev.balloon_allowed = true; 113 + vapdev->vdev.ram_block_discard_allowed = true; 114 114 115 115 ret = vfio_get_device(vfio_group, mdevid, &vapdev->vdev, errp); 116 116 if (ret) {
+6 -5
hw/vfio/ccw.c
··· 574 574 575 575 /* 576 576 * All vfio-ccw devices are believed to operate in a way compatible with 577 - * memory ballooning, ie. pages pinned in the host are in the current 578 - * working set of the guest driver and therefore never overlap with pages 579 - * available to the guest balloon driver. This needs to be set before 580 - * vfio_get_device() for vfio common to handle the balloon inhibitor. 577 + * discarding of memory in RAM blocks, ie. pages pinned in the host are 578 + * in the current working set of the guest driver and therefore never 579 + * overlap e.g., with pages available to the guest balloon driver. This 580 + * needs to be set before vfio_get_device() for vfio common to handle 581 + * ram_block_discard_disable(). 581 582 */ 582 - vcdev->vdev.balloon_allowed = true; 583 + vcdev->vdev.ram_block_discard_allowed = true; 583 584 584 585 if (vfio_get_device(group, vcdev->cdev.mdevid, &vcdev->vdev, errp)) { 585 586 goto out_err;
+29 -24
hw/vfio/common.c
··· 33 33 #include "qemu/error-report.h" 34 34 #include "qemu/main-loop.h" 35 35 #include "qemu/range.h" 36 - #include "sysemu/balloon.h" 37 36 #include "sysemu/kvm.h" 38 37 #include "sysemu/reset.h" 39 38 #include "trace.h" ··· 1215 1214 space = vfio_get_address_space(as); 1216 1215 1217 1216 /* 1218 - * VFIO is currently incompatible with memory ballooning insofar as the 1217 + * VFIO is currently incompatible with discarding of RAM insofar as the 1219 1218 * madvise to purge (zap) the page from QEMU's address space does not 1220 1219 * interact with the memory API and therefore leaves stale virtual to 1221 1220 * physical mappings in the IOMMU if the page was previously pinned. We 1222 - * therefore add a balloon inhibit for each group added to a container, 1221 + * therefore set discarding broken for each group added to a container, 1223 1222 * whether the container is used individually or shared. This provides 1224 1223 * us with options to allow devices within a group to opt-in and allow 1225 - * ballooning, so long as it is done consistently for a group (for instance 1224 + * discarding, so long as it is done consistently for a group (for instance 1226 1225 * if the device is an mdev device where it is known that the host vendor 1227 1226 * driver will never pin pages outside of the working set of the guest 1228 - * driver, which would thus not be ballooning candidates). 1227 + * driver, which would thus not be discarding candidates). 1229 1228 * 1230 1229 * The first opportunity to induce pinning occurs here where we attempt to 1231 1230 * attach the group to existing containers within the AddressSpace. If any 1232 - * pages are already zapped from the virtual address space, such as from a 1233 - * previous ballooning opt-in, new pinning will cause valid mappings to be 1231 + * pages are already zapped from the virtual address space, such as from 1232 + * previous discards, new pinning will cause valid mappings to be 1234 1233 * re-established. 
Likewise, when the overall MemoryListener for a new 1235 1234 * container is registered, a replay of mappings within the AddressSpace 1236 1235 * will occur, re-establishing any previously zapped pages as well. 1237 1236 * 1238 - * NB. Balloon inhibiting does not currently block operation of the 1239 - * balloon driver or revoke previously pinned pages, it only prevents 1240 - * calling madvise to modify the virtual mapping of ballooned pages. 1237 + * Especially virtio-balloon is currently only prevented from discarding 1238 + * new memory, it will not yet set ram_block_discard_set_required() and 1239 + * therefore, neither stops us here or deals with the sudden memory 1240 + * consumption of inflated memory. 1241 1241 */ 1242 - qemu_balloon_inhibit(true); 1242 + ret = ram_block_discard_disable(true); 1243 + if (ret) { 1244 + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); 1245 + return ret; 1246 + } 1243 1247 1244 1248 QLIST_FOREACH(container, &space->containers, next) { 1245 1249 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { ··· 1405 1409 close(fd); 1406 1410 1407 1411 put_space_exit: 1408 - qemu_balloon_inhibit(false); 1412 + ram_block_discard_disable(false); 1409 1413 vfio_put_address_space(space); 1410 1414 1411 1415 return ret; ··· 1526 1530 return; 1527 1531 } 1528 1532 1529 - if (!group->balloon_allowed) { 1530 - qemu_balloon_inhibit(false); 1533 + if (!group->ram_block_discard_allowed) { 1534 + ram_block_discard_disable(false); 1531 1535 } 1532 1536 vfio_kvm_device_del_group(group); 1533 1537 vfio_disconnect_container(group); ··· 1565 1569 } 1566 1570 1567 1571 /* 1568 - * Clear the balloon inhibitor for this group if the driver knows the 1569 - * device operates compatibly with ballooning. Setting must be consistent 1570 - * per group, but since compatibility is really only possible with mdev 1571 - * currently, we expect singleton groups. 
1572 + * Set discarding of RAM as not broken for this group if the driver knows 1573 + * the device operates compatibly with discarding. Setting must be 1574 + * consistent per group, but since compatibility is really only possible 1575 + * with mdev currently, we expect singleton groups. 1572 1576 */ 1573 - if (vbasedev->balloon_allowed != group->balloon_allowed) { 1577 + if (vbasedev->ram_block_discard_allowed != 1578 + group->ram_block_discard_allowed) { 1574 1579 if (!QLIST_EMPTY(&group->device_list)) { 1575 - error_setg(errp, 1576 - "Inconsistent device balloon setting within group"); 1580 + error_setg(errp, "Inconsistent setting of support for discarding " 1581 + "RAM (e.g., balloon) within group"); 1577 1582 close(fd); 1578 1583 return -1; 1579 1584 } 1580 1585 1581 - if (!group->balloon_allowed) { 1582 - group->balloon_allowed = true; 1583 - qemu_balloon_inhibit(false); 1586 + if (!group->ram_block_discard_allowed) { 1587 + group->ram_block_discard_allowed = true; 1588 + ram_block_discard_disable(false); 1584 1589 } 1585 1590 } 1586 1591
+3 -3
hw/vfio/pci.c
··· 2789 2789 } 2790 2790 2791 2791 /* 2792 - * Mediated devices *might* operate compatibly with memory ballooning, but 2792 + * Mediated devices *might* operate compatibly with discarding of RAM, but 2793 2793 * we cannot know for certain, it depends on whether the mdev vendor driver 2794 2794 * stays in sync with the active working set of the guest driver. Prevent 2795 2795 * the x-balloon-allowed option unless this is minimally an mdev device. ··· 2802 2802 2803 2803 trace_vfio_mdev(vdev->vbasedev.name, is_mdev); 2804 2804 2805 - if (vdev->vbasedev.balloon_allowed && !is_mdev) { 2805 + if (vdev->vbasedev.ram_block_discard_allowed && !is_mdev) { 2806 2806 error_setg(errp, "x-balloon-allowed only potentially compatible " 2807 2807 "with mdev devices"); 2808 2808 vfio_put_group(group); ··· 3156 3156 VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), 3157 3157 DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), 3158 3158 DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, 3159 - vbasedev.balloon_allowed, false), 3159 + vbasedev.ram_block_discard_allowed, false), 3160 3160 DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), 3161 3161 DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), 3162 3162 DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
+11
hw/virtio/Kconfig
··· 47 47 depends on VIRTIO 48 48 depends on VIRTIO_PMEM_SUPPORTED 49 49 select MEM_DEVICE 50 + 51 + config VIRTIO_MEM_SUPPORTED 52 + bool 53 + 54 + config VIRTIO_MEM 55 + bool 56 + default y 57 + depends on VIRTIO 58 + depends on LINUX 59 + depends on VIRTIO_MEM_SUPPORTED 60 + select MEM_DEVICE
+3
hw/virtio/Makefile.objs
··· 5 5 obj-$(CONFIG_VHOST) += vhost.o vhost-backend.o 6 6 common-obj-$(call lnot,$(CONFIG_VHOST)) += vhost-stub.o 7 7 obj-$(CONFIG_VHOST_USER) += vhost-user.o 8 + obj-$(CONFIG_VHOST_VDPA) += vhost-vdpa.o 8 9 9 10 common-obj-$(CONFIG_VIRTIO_RNG) += virtio-rng.o 10 11 common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o ··· 19 20 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o 20 21 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-common.o vhost-vsock.o 21 22 obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-vsock-common.o vhost-user-vsock.o 23 + obj-$(CONFIG_VIRTIO_MEM) += virtio-mem.o 24 + common-obj-$(call land,$(CONFIG_VIRTIO_MEM),$(CONFIG_VIRTIO_PCI)) += virtio-mem-pci.o 22 25 23 26 ifeq ($(CONFIG_VIRTIO_PCI),y) 24 27 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o
+10
hw/virtio/trace-events
··· 75 75 virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d" 76 76 virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64 77 77 virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t start, uint64_t end) "dev= %d, type=%d start=0x%"PRIx64" end=0x%"PRIx64 78 + 79 + # virtio-mem.c 80 + virtio_mem_send_response(uint16_t type) "type=%" PRIu16 81 + virtio_mem_plug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16 82 + virtio_mem_unplug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16 83 + virtio_mem_unplugged_all(void) "" 84 + virtio_mem_unplug_all_request(void) "" 85 + virtio_mem_resized_usable_region(uint64_t old_size, uint64_t new_size) "old_size=0x%" PRIx64 "new_size=0x%" PRIx64 86 + virtio_mem_state_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16 87 + virtio_mem_state_response(uint16_t state) "state=%" PRIu16
+6
hw/virtio/vhost-backend.c
··· 15 15 #include "qemu/main-loop.h" 16 16 #include "standard-headers/linux/vhost_types.h" 17 17 18 + #include "hw/virtio/vhost-vdpa.h" 18 19 #ifdef CONFIG_VHOST_KERNEL 19 20 #include <linux/vhost.h> 20 21 #include <sys/ioctl.h> ··· 284 285 #ifdef CONFIG_VHOST_USER 285 286 case VHOST_BACKEND_TYPE_USER: 286 287 dev->vhost_ops = &user_ops; 288 + break; 289 + #endif 290 + #ifdef CONFIG_VHOST_VDPA 291 + case VHOST_BACKEND_TYPE_VDPA: 292 + dev->vhost_ops = &vdpa_ops; 287 293 break; 288 294 #endif 289 295 default:
+475
hw/virtio/vhost-vdpa.c
··· 1 + /* 2 + * vhost-vdpa 3 + * 4 + * Copyright(c) 2017-2018 Intel Corporation. 5 + * Copyright(c) 2020 Red Hat, Inc. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 + * See the COPYING file in the top-level directory. 9 + * 10 + */ 11 + 12 + #include "qemu/osdep.h" 13 + #include <linux/vhost.h> 14 + #include <linux/vfio.h> 15 + #include <sys/eventfd.h> 16 + #include <sys/ioctl.h> 17 + #include "hw/virtio/vhost.h" 18 + #include "hw/virtio/vhost-backend.h" 19 + #include "hw/virtio/virtio-net.h" 20 + #include "hw/virtio/vhost-vdpa.h" 21 + #include "qemu/main-loop.h" 22 + #include <linux/kvm.h> 23 + #include "sysemu/kvm.h" 24 + 25 + static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section) 26 + { 27 + return (!memory_region_is_ram(section->mr) && 28 + !memory_region_is_iommu(section->mr)) || 29 + /* 30 + * Sizing an enabled 64-bit BAR can cause spurious mappings to 31 + * addresses in the upper part of the 64-bit address space. These 32 + * are never accessed by the CPU and beyond the address width of 33 + * some IOMMU hardware. TODO: VDPA should tell us the IOMMU width. 34 + */ 35 + section->offset_within_address_space & (1ULL << 63); 36 + } 37 + 38 + static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, 39 + void *vaddr, bool readonly) 40 + { 41 + struct vhost_msg_v2 msg; 42 + int fd = v->device_fd; 43 + int ret = 0; 44 + 45 + msg.type = v->msg_type; 46 + msg.iotlb.iova = iova; 47 + msg.iotlb.size = size; 48 + msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr; 49 + msg.iotlb.perm = readonly ? 
VHOST_ACCESS_RO : VHOST_ACCESS_RW; 50 + msg.iotlb.type = VHOST_IOTLB_UPDATE; 51 + 52 + if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { 53 + error_report("failed to write, fd=%d, errno=%d (%s)", 54 + fd, errno, strerror(errno)); 55 + return -EIO ; 56 + } 57 + 58 + return ret; 59 + } 60 + 61 + static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, 62 + hwaddr size) 63 + { 64 + struct vhost_msg_v2 msg; 65 + int fd = v->device_fd; 66 + int ret = 0; 67 + 68 + msg.type = v->msg_type; 69 + msg.iotlb.iova = iova; 70 + msg.iotlb.size = size; 71 + msg.iotlb.type = VHOST_IOTLB_INVALIDATE; 72 + 73 + if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { 74 + error_report("failed to write, fd=%d, errno=%d (%s)", 75 + fd, errno, strerror(errno)); 76 + return -EIO ; 77 + } 78 + 79 + return ret; 80 + } 81 + 82 + static void vhost_vdpa_listener_region_add(MemoryListener *listener, 83 + MemoryRegionSection *section) 84 + { 85 + struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener); 86 + hwaddr iova; 87 + Int128 llend, llsize; 88 + void *vaddr; 89 + int ret; 90 + 91 + if (vhost_vdpa_listener_skipped_section(section)) { 92 + return; 93 + } 94 + 95 + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != 96 + (section->offset_within_region & ~TARGET_PAGE_MASK))) { 97 + error_report("%s received unaligned region", __func__); 98 + return; 99 + } 100 + 101 + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); 102 + llend = int128_make64(section->offset_within_address_space); 103 + llend = int128_add(llend, section->size); 104 + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); 105 + 106 + if (int128_ge(int128_make64(iova), llend)) { 107 + return; 108 + } 109 + 110 + memory_region_ref(section->mr); 111 + 112 + /* Here we assume that memory_region_is_ram(section->mr)==true */ 113 + 114 + vaddr = memory_region_get_ram_ptr(section->mr) + 115 + section->offset_within_region + 116 + (iova - 
section->offset_within_address_space); 117 + 118 + llsize = int128_sub(llend, int128_make64(iova)); 119 + 120 + ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), 121 + vaddr, section->readonly); 122 + if (ret) { 123 + error_report("vhost vdpa map fail!"); 124 + if (memory_region_is_ram_device(section->mr)) { 125 + /* Allow unexpected mappings not to be fatal for RAM devices */ 126 + error_report("map ram fail!"); 127 + return ; 128 + } 129 + goto fail; 130 + } 131 + 132 + return; 133 + 134 + fail: 135 + if (memory_region_is_ram_device(section->mr)) { 136 + error_report("failed to vdpa_dma_map. pci p2p may not work"); 137 + return; 138 + 139 + } 140 + /* 141 + * On the initfn path, store the first error in the container so we 142 + * can gracefully fail. Runtime, there's not much we can do other 143 + * than throw a hardware error. 144 + */ 145 + error_report("vhost-vdpa: DMA mapping failed, unable to continue"); 146 + return; 147 + 148 + } 149 + 150 + static void vhost_vdpa_listener_region_del(MemoryListener *listener, 151 + MemoryRegionSection *section) 152 + { 153 + struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener); 154 + hwaddr iova; 155 + Int128 llend, llsize; 156 + int ret; 157 + bool try_unmap = true; 158 + 159 + if (vhost_vdpa_listener_skipped_section(section)) { 160 + return; 161 + } 162 + 163 + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != 164 + (section->offset_within_region & ~TARGET_PAGE_MASK))) { 165 + error_report("%s received unaligned region", __func__); 166 + return; 167 + } 168 + 169 + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); 170 + llend = int128_make64(section->offset_within_address_space); 171 + llend = int128_add(llend, section->size); 172 + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); 173 + 174 + if (int128_ge(int128_make64(iova), llend)) { 175 + return; 176 + } 177 + 178 + llsize = int128_sub(llend, int128_make64(iova)); 179 + 180 + if 
(try_unmap) { 181 + ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); 182 + if (ret) { 183 + error_report("vhost_vdpa dma unmap error!"); 184 + } 185 + } 186 + 187 + memory_region_unref(section->mr); 188 + } 189 + /* 190 + * IOTLB API is used by vhost-vpda which requires incremental updating 191 + * of the mapping. So we can not use generic vhost memory listener which 192 + * depends on the addnop(). 193 + */ 194 + static const MemoryListener vhost_vdpa_memory_listener = { 195 + .region_add = vhost_vdpa_listener_region_add, 196 + .region_del = vhost_vdpa_listener_region_del, 197 + }; 198 + 199 + static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, 200 + void *arg) 201 + { 202 + struct vhost_vdpa *v = dev->opaque; 203 + int fd = v->device_fd; 204 + 205 + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); 206 + 207 + return ioctl(fd, request, arg); 208 + } 209 + 210 + static void vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status) 211 + { 212 + uint8_t s; 213 + 214 + if (vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s)) { 215 + return; 216 + } 217 + 218 + s |= status; 219 + 220 + vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s); 221 + } 222 + 223 + static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque) 224 + { 225 + struct vhost_vdpa *v; 226 + uint64_t features; 227 + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); 228 + 229 + v = opaque; 230 + dev->opaque = opaque ; 231 + vhost_vdpa_call(dev, VHOST_GET_FEATURES, &features); 232 + dev->backend_features = features; 233 + v->listener = vhost_vdpa_memory_listener; 234 + v->msg_type = VHOST_IOTLB_MSG_V2; 235 + 236 + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | 237 + VIRTIO_CONFIG_S_DRIVER); 238 + 239 + return 0; 240 + } 241 + 242 + static int vhost_vdpa_cleanup(struct vhost_dev *dev) 243 + { 244 + struct vhost_vdpa *v; 245 + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); 246 + v = dev->opaque; 247 + 
memory_listener_unregister(&v->listener); 248 + 249 + dev->opaque = NULL; 250 + return 0; 251 + } 252 + 253 + static int vhost_vdpa_memslots_limit(struct vhost_dev *dev) 254 + { 255 + return INT_MAX; 256 + } 257 + 258 + static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, 259 + struct vhost_memory *mem) 260 + { 261 + 262 + if (mem->padding) { 263 + return -1; 264 + } 265 + 266 + return 0; 267 + } 268 + 269 + static int vhost_vdpa_set_features(struct vhost_dev *dev, 270 + uint64_t features) 271 + { 272 + int ret; 273 + ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); 274 + uint8_t status = 0; 275 + if (ret) { 276 + return ret; 277 + } 278 + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); 279 + vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &status); 280 + 281 + return !(status & VIRTIO_CONFIG_S_FEATURES_OK); 282 + } 283 + 284 + int vhost_vdpa_get_device_id(struct vhost_dev *dev, 285 + uint32_t *device_id) 286 + { 287 + return vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id); 288 + } 289 + 290 + static int vhost_vdpa_reset_device(struct vhost_dev *dev) 291 + { 292 + uint8_t status = 0; 293 + 294 + return vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); 295 + } 296 + 297 + static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) 298 + { 299 + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); 300 + 301 + return idx - dev->vq_index; 302 + } 303 + 304 + static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev) 305 + { 306 + int i; 307 + for (i = 0; i < dev->nvqs; ++i) { 308 + struct vhost_vring_state state = { 309 + .index = dev->vq_index + i, 310 + .num = 1, 311 + }; 312 + vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); 313 + } 314 + return 0; 315 + } 316 + 317 + static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data, 318 + uint32_t offset, uint32_t size, 319 + uint32_t flags) 320 + { 321 + struct vhost_vdpa_config *config; 322 + int ret; 323 + unsigned long config_size = 
offsetof(struct vhost_vdpa_config, buf); 324 + config = g_malloc(size + config_size); 325 + if (config == NULL) { 326 + return -1; 327 + } 328 + config->off = offset; 329 + config->len = size; 330 + memcpy(config->buf, data, size); 331 + ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config); 332 + g_free(config); 333 + return ret; 334 + } 335 + 336 + static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, 337 + uint32_t config_len) 338 + { 339 + struct vhost_vdpa_config *v_config; 340 + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); 341 + int ret; 342 + 343 + v_config = g_malloc(config_len + config_size); 344 + if (v_config == NULL) { 345 + return -1; 346 + } 347 + v_config->len = config_len; 348 + v_config->off = 0; 349 + ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config); 350 + memcpy(config, v_config->buf, config_len); 351 + g_free(v_config); 352 + return ret; 353 + } 354 + 355 + static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) 356 + { 357 + struct vhost_vdpa *v = dev->opaque; 358 + if (started) { 359 + uint8_t status = 0; 360 + memory_listener_register(&v->listener, &address_space_memory); 361 + vhost_vdpa_set_vring_ready(dev); 362 + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); 363 + vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &status); 364 + 365 + return !(status & VIRTIO_CONFIG_S_DRIVER_OK); 366 + } else { 367 + vhost_vdpa_reset_device(dev); 368 + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | 369 + VIRTIO_CONFIG_S_DRIVER); 370 + memory_listener_unregister(&v->listener); 371 + 372 + return 0; 373 + } 374 + } 375 + 376 + static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, 377 + struct vhost_log *log) 378 + { 379 + return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base); 380 + } 381 + 382 + static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, 383 + struct vhost_vring_addr *addr) 384 + { 385 + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, 
addr); 386 + } 387 + 388 + static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, 389 + struct vhost_vring_state *ring) 390 + { 391 + return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring); 392 + } 393 + 394 + static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, 395 + struct vhost_vring_state *ring) 396 + { 397 + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); 398 + } 399 + 400 + static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, 401 + struct vhost_vring_state *ring) 402 + { 403 + return vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); 404 + } 405 + 406 + static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, 407 + struct vhost_vring_file *file) 408 + { 409 + return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); 410 + } 411 + 412 + static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, 413 + struct vhost_vring_file *file) 414 + { 415 + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); 416 + } 417 + 418 + static int vhost_vdpa_get_features(struct vhost_dev *dev, 419 + uint64_t *features) 420 + { 421 + return vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); 422 + } 423 + 424 + static int vhost_vdpa_set_owner(struct vhost_dev *dev) 425 + { 426 + return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL); 427 + } 428 + 429 + static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev, 430 + struct vhost_vring_addr *addr, struct vhost_virtqueue *vq) 431 + { 432 + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); 433 + addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys; 434 + addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys; 435 + addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys; 436 + return 0; 437 + } 438 + 439 + static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) 440 + { 441 + return true; 442 + } 443 + 444 + const VhostOps vdpa_ops = { 445 + .backend_type = VHOST_BACKEND_TYPE_VDPA, 446 + .vhost_backend_init = vhost_vdpa_init, 447 + .vhost_backend_cleanup = 
vhost_vdpa_cleanup, 448 + .vhost_set_log_base = vhost_vdpa_set_log_base, 449 + .vhost_set_vring_addr = vhost_vdpa_set_vring_addr, 450 + .vhost_set_vring_num = vhost_vdpa_set_vring_num, 451 + .vhost_set_vring_base = vhost_vdpa_set_vring_base, 452 + .vhost_get_vring_base = vhost_vdpa_get_vring_base, 453 + .vhost_set_vring_kick = vhost_vdpa_set_vring_kick, 454 + .vhost_set_vring_call = vhost_vdpa_set_vring_call, 455 + .vhost_get_features = vhost_vdpa_get_features, 456 + .vhost_set_owner = vhost_vdpa_set_owner, 457 + .vhost_set_vring_endian = NULL, 458 + .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit, 459 + .vhost_set_mem_table = vhost_vdpa_set_mem_table, 460 + .vhost_set_features = vhost_vdpa_set_features, 461 + .vhost_reset_device = vhost_vdpa_reset_device, 462 + .vhost_get_vq_index = vhost_vdpa_get_vq_index, 463 + .vhost_get_config = vhost_vdpa_get_config, 464 + .vhost_set_config = vhost_vdpa_set_config, 465 + .vhost_requires_shm_log = NULL, 466 + .vhost_migration_done = NULL, 467 + .vhost_backend_can_merge = NULL, 468 + .vhost_net_set_mtu = NULL, 469 + .vhost_set_iotlb_callback = NULL, 470 + .vhost_send_device_iotlb_msg = NULL, 471 + .vhost_dev_start = vhost_vdpa_dev_start, 472 + .vhost_get_device_id = vhost_vdpa_get_device_id, 473 + .vhost_vq_get_addr = vhost_vdpa_vq_get_addr, 474 + .vhost_force_iommu = vhost_vdpa_force_iommu, 475 + };
+39 -13
hw/virtio/vhost.c
··· 773 773 struct vhost_virtqueue *vq, 774 774 unsigned idx, bool enable_log) 775 775 { 776 - struct vhost_vring_addr addr = { 777 - .index = idx, 778 - .desc_user_addr = (uint64_t)(unsigned long)vq->desc, 779 - .avail_user_addr = (uint64_t)(unsigned long)vq->avail, 780 - .used_user_addr = (uint64_t)(unsigned long)vq->used, 781 - .log_guest_addr = vq->used_phys, 782 - .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, 783 - }; 784 - int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); 776 + struct vhost_vring_addr addr; 777 + int r; 778 + memset(&addr, 0, sizeof(struct vhost_vring_addr)); 779 + 780 + if (dev->vhost_ops->vhost_vq_get_addr) { 781 + r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq); 782 + if (r < 0) { 783 + VHOST_OPS_DEBUG("vhost_vq_get_addr failed"); 784 + return -errno; 785 + } 786 + } else { 787 + addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc; 788 + addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail; 789 + addr.used_user_addr = (uint64_t)(unsigned long)vq->used; 790 + } 791 + addr.index = idx; 792 + addr.log_guest_addr = vq->used_phys; 793 + addr.flags = enable_log ? 
(1 << VHOST_VRING_F_LOG) : 0; 794 + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); 785 795 if (r < 0) { 786 796 VHOST_OPS_DEBUG("vhost_set_vring_addr failed"); 787 797 return -errno; ··· 799 809 } 800 810 if (!vhost_dev_has_iommu(dev)) { 801 811 features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); 812 + } 813 + if (dev->vhost_ops->vhost_force_iommu) { 814 + if (dev->vhost_ops->vhost_force_iommu(dev) == true) { 815 + features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; 816 + } 802 817 } 803 818 r = dev->vhost_ops->vhost_set_features(dev, features); 804 819 if (r < 0) { ··· 1685 1700 goto fail_log; 1686 1701 } 1687 1702 } 1688 - 1689 - if (vhost_dev_has_iommu(hdev)) { 1690 - hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); 1703 + if (hdev->vhost_ops->vhost_dev_start) { 1704 + r = hdev->vhost_ops->vhost_dev_start(hdev, true); 1705 + if (r) { 1706 + goto fail_log; 1707 + } 1708 + } 1709 + if (vhost_dev_has_iommu(hdev) && 1710 + hdev->vhost_ops->vhost_set_iotlb_callback) { 1711 + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); 1691 1712 1692 1713 /* Update used ring information for IOTLB to work correctly, 1693 1714 * vhost-kernel code requires for this.*/ ··· 1722 1743 /* should only be called after backend is connected */ 1723 1744 assert(hdev->vhost_ops); 1724 1745 1746 + if (hdev->vhost_ops->vhost_dev_start) { 1747 + hdev->vhost_ops->vhost_dev_start(hdev, false); 1748 + } 1725 1749 for (i = 0; i < hdev->nvqs; ++i) { 1726 1750 vhost_virtqueue_stop(hdev, 1727 1751 vdev, ··· 1730 1754 } 1731 1755 1732 1756 if (vhost_dev_has_iommu(hdev)) { 1733 - hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); 1757 + if (hdev->vhost_ops->vhost_set_iotlb_callback) { 1758 + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); 1759 + } 1734 1760 memory_listener_unregister(&hdev->iommu_listener); 1735 1761 } 1736 1762 vhost_log_put(hdev, true);
+28 -8
hw/virtio/virtio-balloon.c
··· 63 63 return pbp->base_gpa == base_gpa; 64 64 } 65 65 66 + static bool virtio_balloon_inhibited(void) 67 + { 68 + /* Postcopy cannot deal with concurrent discards, so it's special. */ 69 + return ram_block_discard_is_disabled() || migration_in_incoming_postcopy(); 70 + } 71 + 66 72 static void balloon_inflate_page(VirtIOBalloon *balloon, 67 73 MemoryRegion *mr, hwaddr mr_offset, 68 74 PartiallyBalloonedPage *pbp) ··· 336 342 * accessible by another device or process, or if the guest is 337 343 * expecting it to retain a non-zero value. 338 344 */ 339 - if (qemu_balloon_is_inhibited() || dev->poison_val) { 345 + if (virtio_balloon_inhibited() || dev->poison_val) { 340 346 goto skip_element; 341 347 } 342 348 ··· 421 427 422 428 trace_virtio_balloon_handle_output(memory_region_name(section.mr), 423 429 pa); 424 - if (!qemu_balloon_is_inhibited()) { 430 + if (!virtio_balloon_inhibited()) { 425 431 if (vq == s->ivq) { 426 432 balloon_inflate_page(s, section.mr, 427 433 section.offset_within_region, &pbp); ··· 628 634 { 629 635 VirtIODevice *vdev = VIRTIO_DEVICE(s); 630 636 631 - s->free_page_report_status = FREE_PAGE_REPORT_S_DONE; 632 - virtio_notify_config(vdev); 637 + if (s->free_page_report_status != FREE_PAGE_REPORT_S_DONE) { 638 + /* See virtio_balloon_free_page_stop() */ 639 + qemu_mutex_lock(&s->free_page_lock); 640 + s->free_page_report_status = FREE_PAGE_REPORT_S_DONE; 641 + qemu_mutex_unlock(&s->free_page_lock); 642 + virtio_notify_config(vdev); 643 + } 633 644 } 634 645 635 646 static int ··· 653 664 case PRECOPY_NOTIFY_SETUP: 654 665 precopy_enable_free_page_optimization(); 655 666 break; 656 - case PRECOPY_NOTIFY_COMPLETE: 657 - case PRECOPY_NOTIFY_CLEANUP: 658 667 case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC: 659 668 virtio_balloon_free_page_stop(dev); 660 669 break; 661 670 case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC: 662 671 if (vdev->vm_running) { 663 672 virtio_balloon_free_page_start(dev); 664 - } else { 665 - virtio_balloon_free_page_done(dev); 673 + 
break; 666 674 } 675 + /* 676 + * Set S_DONE before migrating the vmstate, so the guest will reuse 677 + * all hinted pages once running on the destination. Fall through. 678 + */ 679 + case PRECOPY_NOTIFY_CLEANUP: 680 + /* 681 + * Especially, if something goes wrong during precopy or if migration 682 + * is canceled, we have to properly communicate S_DONE to the VM. 683 + */ 684 + virtio_balloon_free_page_done(dev); 685 + break; 686 + case PRECOPY_NOTIFY_COMPLETE: 667 687 break; 668 688 default: 669 689 virtio_error(vdev, "%s: %d reason unknown", __func__, pnd->reason);
+157
hw/virtio/virtio-mem-pci.c
··· 1 + /* 2 + * Virtio MEM PCI device 3 + * 4 + * Copyright (C) 2020 Red Hat, Inc. 5 + * 6 + * Authors: 7 + * David Hildenbrand <david@redhat.com> 8 + * 9 + * This work is licensed under the terms of the GNU GPL, version 2. 10 + * See the COPYING file in the top-level directory. 11 + */ 12 + 13 + #include "qemu/osdep.h" 14 + #include "virtio-mem-pci.h" 15 + #include "hw/mem/memory-device.h" 16 + #include "qapi/error.h" 17 + #include "qapi/qapi-events-misc.h" 18 + 19 + static void virtio_mem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) 20 + { 21 + VirtIOMEMPCI *mem_pci = VIRTIO_MEM_PCI(vpci_dev); 22 + DeviceState *vdev = DEVICE(&mem_pci->vdev); 23 + 24 + qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); 25 + object_property_set_bool(OBJECT(vdev), true, "realized", errp); 26 + } 27 + 28 + static void virtio_mem_pci_set_addr(MemoryDeviceState *md, uint64_t addr, 29 + Error **errp) 30 + { 31 + object_property_set_uint(OBJECT(md), addr, VIRTIO_MEM_ADDR_PROP, errp); 32 + } 33 + 34 + static uint64_t virtio_mem_pci_get_addr(const MemoryDeviceState *md) 35 + { 36 + return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, 37 + &error_abort); 38 + } 39 + 40 + static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md, 41 + Error **errp) 42 + { 43 + VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md); 44 + VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev); 45 + VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); 46 + 47 + return vmc->get_memory_region(vmem, errp); 48 + } 49 + 50 + static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md, 51 + Error **errp) 52 + { 53 + return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP, 54 + errp); 55 + } 56 + 57 + static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md, 58 + MemoryDeviceInfo *info) 59 + { 60 + VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1); 61 + VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md); 62 + VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev); 63 + 
VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem); 64 + DeviceState *dev = DEVICE(md); 65 + 66 + if (dev->id) { 67 + vi->has_id = true; 68 + vi->id = g_strdup(dev->id); 69 + } 70 + 71 + /* let the real device handle everything else */ 72 + vpc->fill_device_info(vmem, vi); 73 + 74 + info->u.virtio_mem.data = vi; 75 + info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM; 76 + } 77 + 78 + static void virtio_mem_pci_size_change_notify(Notifier *notifier, void *data) 79 + { 80 + VirtIOMEMPCI *pci_mem = container_of(notifier, VirtIOMEMPCI, 81 + size_change_notifier); 82 + DeviceState *dev = DEVICE(pci_mem); 83 + const uint64_t * const size_p = data; 84 + const char *id = NULL; 85 + 86 + if (dev->id) { 87 + id = g_strdup(dev->id); 88 + } 89 + 90 + qapi_event_send_memory_device_size_change(!!id, id, *size_p); 91 + } 92 + 93 + static void virtio_mem_pci_class_init(ObjectClass *klass, void *data) 94 + { 95 + DeviceClass *dc = DEVICE_CLASS(klass); 96 + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); 97 + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); 98 + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass); 99 + 100 + k->realize = virtio_mem_pci_realize; 101 + set_bit(DEVICE_CATEGORY_MISC, dc->categories); 102 + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; 103 + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_MEM; 104 + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; 105 + pcidev_k->class_id = PCI_CLASS_OTHERS; 106 + 107 + mdc->get_addr = virtio_mem_pci_get_addr; 108 + mdc->set_addr = virtio_mem_pci_set_addr; 109 + mdc->get_plugged_size = virtio_mem_pci_get_plugged_size; 110 + mdc->get_memory_region = virtio_mem_pci_get_memory_region; 111 + mdc->fill_device_info = virtio_mem_pci_fill_device_info; 112 + } 113 + 114 + static void virtio_mem_pci_instance_init(Object *obj) 115 + { 116 + VirtIOMEMPCI *dev = VIRTIO_MEM_PCI(obj); 117 + VirtIOMEMClass *vmc; 118 + VirtIOMEM *vmem; 119 + 120 + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), 121 + TYPE_VIRTIO_MEM); 122 + 123 
+ dev->size_change_notifier.notify = virtio_mem_pci_size_change_notify; 124 + vmem = VIRTIO_MEM(&dev->vdev); 125 + vmc = VIRTIO_MEM_GET_CLASS(vmem); 126 + /* 127 + * We never remove the notifier again, as we expect both devices to 128 + * disappear at the same time. 129 + */ 130 + vmc->add_size_change_notifier(vmem, &dev->size_change_notifier); 131 + 132 + object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, 133 + OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP); 134 + object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev), 135 + VIRTIO_MEM_SIZE_PROP); 136 + object_property_add_alias(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, 137 + OBJECT(&dev->vdev), 138 + VIRTIO_MEM_REQUESTED_SIZE_PROP); 139 + } 140 + 141 + static const VirtioPCIDeviceTypeInfo virtio_mem_pci_info = { 142 + .base_name = TYPE_VIRTIO_MEM_PCI, 143 + .generic_name = "virtio-mem-pci", 144 + .instance_size = sizeof(VirtIOMEMPCI), 145 + .instance_init = virtio_mem_pci_instance_init, 146 + .class_init = virtio_mem_pci_class_init, 147 + .interfaces = (InterfaceInfo[]) { 148 + { TYPE_MEMORY_DEVICE }, 149 + { } 150 + }, 151 + }; 152 + 153 + static void virtio_mem_pci_register_types(void) 154 + { 155 + virtio_pci_types_register(&virtio_mem_pci_info); 156 + } 157 + type_init(virtio_mem_pci_register_types)
+34
hw/virtio/virtio-mem-pci.h
··· 1 + /* 2 + * Virtio MEM PCI device 3 + * 4 + * Copyright (C) 2020 Red Hat, Inc. 5 + * 6 + * Authors: 7 + * David Hildenbrand <david@redhat.com> 8 + * 9 + * This work is licensed under the terms of the GNU GPL, version 2. 10 + * See the COPYING file in the top-level directory. 11 + */ 12 + 13 + #ifndef QEMU_VIRTIO_MEM_PCI_H 14 + #define QEMU_VIRTIO_MEM_PCI_H 15 + 16 + #include "hw/virtio/virtio-pci.h" 17 + #include "hw/virtio/virtio-mem.h" 18 + 19 + typedef struct VirtIOMEMPCI VirtIOMEMPCI; 20 + 21 + /* 22 + * virtio-mem-pci: This extends VirtioPCIProxy. 23 + */ 24 + #define TYPE_VIRTIO_MEM_PCI "virtio-mem-pci-base" 25 + #define VIRTIO_MEM_PCI(obj) \ 26 + OBJECT_CHECK(VirtIOMEMPCI, (obj), TYPE_VIRTIO_MEM_PCI) 27 + 28 + struct VirtIOMEMPCI { 29 + VirtIOPCIProxy parent_obj; 30 + VirtIOMEM vdev; 31 + Notifier size_change_notifier; 32 + }; 33 + 34 + #endif /* QEMU_VIRTIO_MEM_PCI_H */
+873
hw/virtio/virtio-mem.c
··· 1 + /* 2 + * Virtio MEM device 3 + * 4 + * Copyright (C) 2020 Red Hat, Inc. 5 + * 6 + * Authors: 7 + * David Hildenbrand <david@redhat.com> 8 + * 9 + * This work is licensed under the terms of the GNU GPL, version 2. 10 + * See the COPYING file in the top-level directory. 11 + */ 12 + 13 + #include "qemu/osdep.h" 14 + #include "qemu-common.h" 15 + #include "qemu/iov.h" 16 + #include "qemu/cutils.h" 17 + #include "qemu/error-report.h" 18 + #include "qemu/units.h" 19 + #include "sysemu/numa.h" 20 + #include "sysemu/sysemu.h" 21 + #include "sysemu/reset.h" 22 + #include "hw/virtio/virtio.h" 23 + #include "hw/virtio/virtio-bus.h" 24 + #include "hw/virtio/virtio-access.h" 25 + #include "hw/virtio/virtio-mem.h" 26 + #include "qapi/error.h" 27 + #include "qapi/visitor.h" 28 + #include "exec/ram_addr.h" 29 + #include "migration/misc.h" 30 + #include "hw/boards.h" 31 + #include "hw/qdev-properties.h" 32 + #include "config-devices.h" 33 + #include "trace.h" 34 + 35 + /* 36 + * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging 37 + * memory (e.g., 2MB on x86_64). 38 + */ 39 + #define VIRTIO_MEM_MIN_BLOCK_SIZE QEMU_VMALLOC_ALIGN 40 + /* 41 + * Size the usable region bigger than the requested size if possible. Esp. 42 + * Linux guests will only add (aligned) memory blocks in case they fully 43 + * fit into the usable region, but plug+online only a subset of the pages. 44 + * The memory block size corresponds mostly to the section size. 45 + * 46 + * This allows e.g., to add 20MB with a section size of 128MB on x86_64, and 47 + * a section size of 1GB on arm64 (as long as the start address is properly 48 + * aligned, similar to ordinary DIMMs). 49 + * 50 + * We can change this at any time and maybe even make it configurable if 51 + * necessary (as the section size can change). But it's more likely that the 52 + * section size will rather get smaller and not bigger over time. 
53 + */ 54 + #if defined(TARGET_X86_64) || defined(TARGET_I386) 55 + #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB)) 56 + #else 57 + #error VIRTIO_MEM_USABLE_EXTENT not defined 58 + #endif 59 + 60 + static bool virtio_mem_is_busy(void) 61 + { 62 + /* 63 + * Postcopy cannot handle concurrent discards and we don't want to migrate 64 + * pages on-demand with stale content when plugging new blocks. 65 + * 66 + * For precopy, we don't want unplugged blocks in our migration stream, and 67 + * when plugging new blocks, the page content might differ between source 68 + * and destination (observable by the guest when not initializing pages 69 + * after plugging them) until we're running on the destination (as we didn't 70 + * migrate these blocks when they were unplugged). 71 + */ 72 + return migration_in_incoming_postcopy() || !migration_is_idle(); 73 + } 74 + 75 + static bool virtio_mem_test_bitmap(VirtIOMEM *vmem, uint64_t start_gpa, 76 + uint64_t size, bool plugged) 77 + { 78 + const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size; 79 + const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1; 80 + unsigned long found_bit; 81 + 82 + /* We fake a shorter bitmap to avoid searching too far. 
*/ 83 + if (plugged) { 84 + found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit); 85 + } else { 86 + found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit); 87 + } 88 + return found_bit > last_bit; 89 + } 90 + 91 + static void virtio_mem_set_bitmap(VirtIOMEM *vmem, uint64_t start_gpa, 92 + uint64_t size, bool plugged) 93 + { 94 + const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size; 95 + const unsigned long nbits = size / vmem->block_size; 96 + 97 + if (plugged) { 98 + bitmap_set(vmem->bitmap, bit, nbits); 99 + } else { 100 + bitmap_clear(vmem->bitmap, bit, nbits); 101 + } 102 + } 103 + 104 + static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem, 105 + struct virtio_mem_resp *resp) 106 + { 107 + VirtIODevice *vdev = VIRTIO_DEVICE(vmem); 108 + VirtQueue *vq = vmem->vq; 109 + 110 + trace_virtio_mem_send_response(le16_to_cpu(resp->type)); 111 + iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp)); 112 + 113 + virtqueue_push(vq, elem, sizeof(*resp)); 114 + virtio_notify(vdev, vq); 115 + } 116 + 117 + static void virtio_mem_send_response_simple(VirtIOMEM *vmem, 118 + VirtQueueElement *elem, 119 + uint16_t type) 120 + { 121 + struct virtio_mem_resp resp = { 122 + .type = cpu_to_le16(type), 123 + }; 124 + 125 + virtio_mem_send_response(vmem, elem, &resp); 126 + } 127 + 128 + static bool virtio_mem_valid_range(VirtIOMEM *vmem, uint64_t gpa, uint64_t size) 129 + { 130 + if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) { 131 + return false; 132 + } 133 + if (gpa + size < gpa || !size) { 134 + return false; 135 + } 136 + if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) { 137 + return false; 138 + } 139 + if (gpa + size > vmem->addr + vmem->usable_region_size) { 140 + return false; 141 + } 142 + return true; 143 + } 144 + 145 + static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, 146 + uint64_t size, bool plug) 147 + { 148 + const uint64_t offset = start_gpa - 
vmem->addr; 149 + int ret; 150 + 151 + if (virtio_mem_is_busy()) { 152 + return -EBUSY; 153 + } 154 + 155 + if (!plug) { 156 + ret = ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size); 157 + if (ret) { 158 + error_report("Unexpected error discarding RAM: %s", 159 + strerror(-ret)); 160 + return -EBUSY; 161 + } 162 + } 163 + virtio_mem_set_bitmap(vmem, start_gpa, size, plug); 164 + return 0; 165 + } 166 + 167 + static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa, 168 + uint16_t nb_blocks, bool plug) 169 + { 170 + const uint64_t size = nb_blocks * vmem->block_size; 171 + int ret; 172 + 173 + if (!virtio_mem_valid_range(vmem, gpa, size)) { 174 + return VIRTIO_MEM_RESP_ERROR; 175 + } 176 + 177 + if (plug && (vmem->size + size > vmem->requested_size)) { 178 + return VIRTIO_MEM_RESP_NACK; 179 + } 180 + 181 + /* test if really all blocks are in the opposite state */ 182 + if (!virtio_mem_test_bitmap(vmem, gpa, size, !plug)) { 183 + return VIRTIO_MEM_RESP_ERROR; 184 + } 185 + 186 + ret = virtio_mem_set_block_state(vmem, gpa, size, plug); 187 + if (ret) { 188 + return VIRTIO_MEM_RESP_BUSY; 189 + } 190 + if (plug) { 191 + vmem->size += size; 192 + } else { 193 + vmem->size -= size; 194 + } 195 + notifier_list_notify(&vmem->size_change_notifiers, &vmem->size); 196 + return VIRTIO_MEM_RESP_ACK; 197 + } 198 + 199 + static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem, 200 + struct virtio_mem_req *req) 201 + { 202 + const uint64_t gpa = le64_to_cpu(req->u.plug.addr); 203 + const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks); 204 + uint16_t type; 205 + 206 + trace_virtio_mem_plug_request(gpa, nb_blocks); 207 + type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true); 208 + virtio_mem_send_response_simple(vmem, elem, type); 209 + } 210 + 211 + static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem, 212 + struct virtio_mem_req *req) 213 + { 214 + const uint64_t gpa = 
le64_to_cpu(req->u.unplug.addr); 215 + const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks); 216 + uint16_t type; 217 + 218 + trace_virtio_mem_unplug_request(gpa, nb_blocks); 219 + type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false); 220 + virtio_mem_send_response_simple(vmem, elem, type); 221 + } 222 + 223 + static void virtio_mem_resize_usable_region(VirtIOMEM *vmem, 224 + uint64_t requested_size, 225 + bool can_shrink) 226 + { 227 + uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr), 228 + requested_size + VIRTIO_MEM_USABLE_EXTENT); 229 + 230 + if (!requested_size) { 231 + newsize = 0; 232 + } 233 + 234 + if (newsize < vmem->usable_region_size && !can_shrink) { 235 + return; 236 + } 237 + 238 + trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize); 239 + vmem->usable_region_size = newsize; 240 + } 241 + 242 + static int virtio_mem_unplug_all(VirtIOMEM *vmem) 243 + { 244 + RAMBlock *rb = vmem->memdev->mr.ram_block; 245 + int ret; 246 + 247 + if (virtio_mem_is_busy()) { 248 + return -EBUSY; 249 + } 250 + 251 + ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); 252 + if (ret) { 253 + error_report("Unexpected error discarding RAM: %s", strerror(-ret)); 254 + return -EBUSY; 255 + } 256 + bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size); 257 + if (vmem->size) { 258 + vmem->size = 0; 259 + notifier_list_notify(&vmem->size_change_notifiers, &vmem->size); 260 + } 261 + trace_virtio_mem_unplugged_all(); 262 + virtio_mem_resize_usable_region(vmem, vmem->requested_size, true); 263 + return 0; 264 + } 265 + 266 + static void virtio_mem_unplug_all_request(VirtIOMEM *vmem, 267 + VirtQueueElement *elem) 268 + { 269 + trace_virtio_mem_unplug_all_request(); 270 + if (virtio_mem_unplug_all(vmem)) { 271 + virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY); 272 + } else { 273 + virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK); 274 + } 275 + } 276 + 277 + static void 
virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem, 278 + struct virtio_mem_req *req) 279 + { 280 + const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks); 281 + const uint64_t gpa = le64_to_cpu(req->u.state.addr); 282 + const uint64_t size = nb_blocks * vmem->block_size; 283 + struct virtio_mem_resp resp = { 284 + .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK), 285 + }; 286 + 287 + trace_virtio_mem_state_request(gpa, nb_blocks); 288 + if (!virtio_mem_valid_range(vmem, gpa, size)) { 289 + virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR); 290 + return; 291 + } 292 + 293 + if (virtio_mem_test_bitmap(vmem, gpa, size, true)) { 294 + resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED); 295 + } else if (virtio_mem_test_bitmap(vmem, gpa, size, false)) { 296 + resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED); 297 + } else { 298 + resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED); 299 + } 300 + trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state)); 301 + virtio_mem_send_response(vmem, elem, &resp); 302 + } 303 + 304 + static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq) 305 + { 306 + const int len = sizeof(struct virtio_mem_req); 307 + VirtIOMEM *vmem = VIRTIO_MEM(vdev); 308 + VirtQueueElement *elem; 309 + struct virtio_mem_req req; 310 + uint16_t type; 311 + 312 + while (true) { 313 + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); 314 + if (!elem) { 315 + return; 316 + } 317 + 318 + if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) { 319 + virtio_error(vdev, "virtio-mem protocol violation: invalid request" 320 + " size: %d", len); 321 + g_free(elem); 322 + return; 323 + } 324 + 325 + if (iov_size(elem->in_sg, elem->in_num) < 326 + sizeof(struct virtio_mem_resp)) { 327 + virtio_error(vdev, "virtio-mem protocol violation: not enough space" 328 + " for response: %zu", 329 + iov_size(elem->in_sg, elem->in_num)); 330 + g_free(elem); 331 + return; 332 + } 333 + 334 + 
type = le16_to_cpu(req.type); 335 + switch (type) { 336 + case VIRTIO_MEM_REQ_PLUG: 337 + virtio_mem_plug_request(vmem, elem, &req); 338 + break; 339 + case VIRTIO_MEM_REQ_UNPLUG: 340 + virtio_mem_unplug_request(vmem, elem, &req); 341 + break; 342 + case VIRTIO_MEM_REQ_UNPLUG_ALL: 343 + virtio_mem_unplug_all_request(vmem, elem); 344 + break; 345 + case VIRTIO_MEM_REQ_STATE: 346 + virtio_mem_state_request(vmem, elem, &req); 347 + break; 348 + default: 349 + virtio_error(vdev, "virtio-mem protocol violation: unknown request" 350 + " type: %d", type); 351 + g_free(elem); 352 + return; 353 + } 354 + 355 + g_free(elem); 356 + } 357 + } 358 + 359 + static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data) 360 + { 361 + VirtIOMEM *vmem = VIRTIO_MEM(vdev); 362 + struct virtio_mem_config *config = (void *) config_data; 363 + 364 + config->block_size = cpu_to_le64(vmem->block_size); 365 + config->node_id = cpu_to_le16(vmem->node); 366 + config->requested_size = cpu_to_le64(vmem->requested_size); 367 + config->plugged_size = cpu_to_le64(vmem->size); 368 + config->addr = cpu_to_le64(vmem->addr); 369 + config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr)); 370 + config->usable_region_size = cpu_to_le64(vmem->usable_region_size); 371 + } 372 + 373 + static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features, 374 + Error **errp) 375 + { 376 + MachineState *ms = MACHINE(qdev_get_machine()); 377 + 378 + if (ms->numa_state) { 379 + #if defined(CONFIG_ACPI) 380 + virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM); 381 + #endif 382 + } 383 + return features; 384 + } 385 + 386 + static void virtio_mem_system_reset(void *opaque) 387 + { 388 + VirtIOMEM *vmem = VIRTIO_MEM(opaque); 389 + 390 + /* 391 + * During usual resets, we will unplug all memory and shrink the usable 392 + * region size. This is, however, not possible in all scenarios. Then, 393 + * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL). 
394 + */ 395 + virtio_mem_unplug_all(vmem); 396 + } 397 + 398 + static void virtio_mem_device_realize(DeviceState *dev, Error **errp) 399 + { 400 + MachineState *ms = MACHINE(qdev_get_machine()); 401 + int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0; 402 + VirtIODevice *vdev = VIRTIO_DEVICE(dev); 403 + VirtIOMEM *vmem = VIRTIO_MEM(dev); 404 + uint64_t page_size; 405 + RAMBlock *rb; 406 + int ret; 407 + 408 + if (!vmem->memdev) { 409 + error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP); 410 + return; 411 + } else if (host_memory_backend_is_mapped(vmem->memdev)) { 412 + char *path = object_get_canonical_path_component(OBJECT(vmem->memdev)); 413 + 414 + error_setg(errp, "'%s' property specifies a busy memdev: %s", 415 + VIRTIO_MEM_MEMDEV_PROP, path); 416 + g_free(path); 417 + return; 418 + } else if (!memory_region_is_ram(&vmem->memdev->mr) || 419 + memory_region_is_rom(&vmem->memdev->mr) || 420 + !vmem->memdev->mr.ram_block) { 421 + error_setg(errp, "'%s' property specifies an unsupported memdev", 422 + VIRTIO_MEM_MEMDEV_PROP); 423 + return; 424 + } 425 + 426 + if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) || 427 + (!nb_numa_nodes && vmem->node)) { 428 + error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds" 429 + "the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP, 430 + vmem->node, nb_numa_nodes ? 
nb_numa_nodes : 1); 431 + return; 432 + } 433 + 434 + if (enable_mlock) { 435 + error_setg(errp, "Incompatible with mlock"); 436 + return; 437 + } 438 + 439 + rb = vmem->memdev->mr.ram_block; 440 + page_size = qemu_ram_pagesize(rb); 441 + 442 + if (vmem->block_size < page_size) { 443 + error_setg(errp, "'%s' property has to be at least the page size (0x%" 444 + PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size); 445 + return; 446 + } else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) { 447 + error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 448 + ")", VIRTIO_MEM_REQUESTED_SIZE_PROP, 449 + VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size); 450 + return; 451 + } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr), 452 + vmem->block_size)) { 453 + error_setg(errp, "'%s' property memdev size has to be multiples of" 454 + "'%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP, 455 + VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size); 456 + return; 457 + } 458 + 459 + if (ram_block_discard_require(true)) { 460 + error_setg(errp, "Discarding RAM is disabled"); 461 + return; 462 + } 463 + 464 + ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); 465 + if (ret) { 466 + error_setg_errno(errp, -ret, "Unexpected error discarding RAM"); 467 + ram_block_discard_require(false); 468 + return; 469 + } 470 + 471 + virtio_mem_resize_usable_region(vmem, vmem->requested_size, true); 472 + 473 + vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) / 474 + vmem->block_size; 475 + vmem->bitmap = bitmap_new(vmem->bitmap_size); 476 + 477 + virtio_init(vdev, TYPE_VIRTIO_MEM, VIRTIO_ID_MEM, 478 + sizeof(struct virtio_mem_config)); 479 + vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request); 480 + 481 + host_memory_backend_set_mapped(vmem->memdev, true); 482 + vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem)); 483 + qemu_register_reset(virtio_mem_system_reset, vmem); 484 + precopy_add_notifier(&vmem->precopy_notifier); 485 
+ } 486 + 487 + static void virtio_mem_device_unrealize(DeviceState *dev) 488 + { 489 + VirtIODevice *vdev = VIRTIO_DEVICE(dev); 490 + VirtIOMEM *vmem = VIRTIO_MEM(dev); 491 + 492 + precopy_remove_notifier(&vmem->precopy_notifier); 493 + qemu_unregister_reset(virtio_mem_system_reset, vmem); 494 + vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem)); 495 + host_memory_backend_set_mapped(vmem->memdev, false); 496 + virtio_del_queue(vdev, 0); 497 + virtio_cleanup(vdev); 498 + g_free(vmem->bitmap); 499 + ram_block_discard_require(false); 500 + } 501 + 502 + static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) 503 + { 504 + RAMBlock *rb = vmem->memdev->mr.ram_block; 505 + unsigned long first_zero_bit, last_zero_bit; 506 + uint64_t offset, length; 507 + int ret; 508 + 509 + /* Find consecutive unplugged blocks and discard the consecutive range. */ 510 + first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); 511 + while (first_zero_bit < vmem->bitmap_size) { 512 + offset = first_zero_bit * vmem->block_size; 513 + last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, 514 + first_zero_bit + 1) - 1; 515 + length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; 516 + 517 + ret = ram_block_discard_range(rb, offset, length); 518 + if (ret) { 519 + error_report("Unexpected error discarding RAM: %s", 520 + strerror(-ret)); 521 + return -EINVAL; 522 + } 523 + first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, 524 + last_zero_bit + 2); 525 + } 526 + return 0; 527 + } 528 + 529 + static int virtio_mem_post_load(void *opaque, int version_id) 530 + { 531 + if (migration_in_incoming_postcopy()) { 532 + return 0; 533 + } 534 + 535 + return virtio_mem_restore_unplugged(VIRTIO_MEM(opaque)); 536 + } 537 + 538 + typedef struct VirtIOMEMMigSanityChecks { 539 + VirtIOMEM *parent; 540 + uint64_t addr; 541 + uint64_t region_size; 542 + uint64_t block_size; 543 + uint32_t node; 544 + } VirtIOMEMMigSanityChecks; 545 + 546 + static int 
virtio_mem_mig_sanity_checks_pre_save(void *opaque) 547 + { 548 + VirtIOMEMMigSanityChecks *tmp = opaque; 549 + VirtIOMEM *vmem = tmp->parent; 550 + 551 + tmp->addr = vmem->addr; 552 + tmp->region_size = memory_region_size(&vmem->memdev->mr); 553 + tmp->block_size = vmem->block_size; 554 + tmp->node = vmem->node; 555 + return 0; 556 + } 557 + 558 + static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id) 559 + { 560 + VirtIOMEMMigSanityChecks *tmp = opaque; 561 + VirtIOMEM *vmem = tmp->parent; 562 + const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr); 563 + 564 + if (tmp->addr != vmem->addr) { 565 + error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64, 566 + VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr); 567 + return -EINVAL; 568 + } 569 + /* 570 + * Note: Preparation for resizeable memory regions. The maximum size 571 + * of the memory region must not change during migration. 572 + */ 573 + if (tmp->region_size != new_region_size) { 574 + error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%" 575 + PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size, 576 + new_region_size); 577 + return -EINVAL; 578 + } 579 + if (tmp->block_size != vmem->block_size) { 580 + error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64, 581 + VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size, 582 + vmem->block_size); 583 + return -EINVAL; 584 + } 585 + if (tmp->node != vmem->node) { 586 + error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32, 587 + VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node); 588 + return -EINVAL; 589 + } 590 + return 0; 591 + } 592 + 593 + static const VMStateDescription vmstate_virtio_mem_sanity_checks = { 594 + .name = "virtio-mem-device/sanity-checks", 595 + .pre_save = virtio_mem_mig_sanity_checks_pre_save, 596 + .post_load = virtio_mem_mig_sanity_checks_post_load, 597 + .fields = (VMStateField[]) { 598 + VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks), 599 + 
VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks), 600 + VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks), 601 + VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks), 602 + VMSTATE_END_OF_LIST(), 603 + }, 604 + }; 605 + 606 + static const VMStateDescription vmstate_virtio_mem_device = { 607 + .name = "virtio-mem-device", 608 + .minimum_version_id = 1, 609 + .version_id = 1, 610 + .post_load = virtio_mem_post_load, 611 + .fields = (VMStateField[]) { 612 + VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks, 613 + vmstate_virtio_mem_sanity_checks), 614 + VMSTATE_UINT64(usable_region_size, VirtIOMEM), 615 + VMSTATE_UINT64(size, VirtIOMEM), 616 + VMSTATE_UINT64(requested_size, VirtIOMEM), 617 + VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size), 618 + VMSTATE_END_OF_LIST() 619 + }, 620 + }; 621 + 622 + static const VMStateDescription vmstate_virtio_mem = { 623 + .name = "virtio-mem", 624 + .minimum_version_id = 1, 625 + .version_id = 1, 626 + .fields = (VMStateField[]) { 627 + VMSTATE_VIRTIO_DEVICE, 628 + VMSTATE_END_OF_LIST() 629 + }, 630 + }; 631 + 632 + static void virtio_mem_fill_device_info(const VirtIOMEM *vmem, 633 + VirtioMEMDeviceInfo *vi) 634 + { 635 + vi->memaddr = vmem->addr; 636 + vi->node = vmem->node; 637 + vi->requested_size = vmem->requested_size; 638 + vi->size = vmem->size; 639 + vi->max_size = memory_region_size(&vmem->memdev->mr); 640 + vi->block_size = vmem->block_size; 641 + vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev)); 642 + } 643 + 644 + static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp) 645 + { 646 + if (!vmem->memdev) { 647 + error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP); 648 + return NULL; 649 + } 650 + 651 + return &vmem->memdev->mr; 652 + } 653 + 654 + static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem, 655 + Notifier *notifier) 656 + { 657 + notifier_list_add(&vmem->size_change_notifiers, notifier); 658 + } 659 + 660 + static void 
virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem, 661 + Notifier *notifier) 662 + { 663 + notifier_remove(notifier); 664 + } 665 + 666 + static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name, 667 + void *opaque, Error **errp) 668 + { 669 + const VirtIOMEM *vmem = VIRTIO_MEM(obj); 670 + uint64_t value = vmem->size; 671 + 672 + visit_type_size(v, name, &value, errp); 673 + } 674 + 675 + static void virtio_mem_get_requested_size(Object *obj, Visitor *v, 676 + const char *name, void *opaque, 677 + Error **errp) 678 + { 679 + const VirtIOMEM *vmem = VIRTIO_MEM(obj); 680 + uint64_t value = vmem->requested_size; 681 + 682 + visit_type_size(v, name, &value, errp); 683 + } 684 + 685 + static void virtio_mem_set_requested_size(Object *obj, Visitor *v, 686 + const char *name, void *opaque, 687 + Error **errp) 688 + { 689 + VirtIOMEM *vmem = VIRTIO_MEM(obj); 690 + Error *err = NULL; 691 + uint64_t value; 692 + 693 + visit_type_size(v, name, &value, &err); 694 + if (err) { 695 + error_propagate(errp, err); 696 + return; 697 + } 698 + 699 + /* 700 + * The block size and memory backend are not fixed until the device was 701 + * realized. realize() will verify these properties then. 702 + */ 703 + if (DEVICE(obj)->realized) { 704 + if (!QEMU_IS_ALIGNED(value, vmem->block_size)) { 705 + error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64 706 + ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP, 707 + vmem->block_size); 708 + return; 709 + } else if (value > memory_region_size(&vmem->memdev->mr)) { 710 + error_setg(errp, "'%s' cannot exceed the memory backend size" 711 + "(0x%" PRIx64 ")", name, 712 + memory_region_size(&vmem->memdev->mr)); 713 + return; 714 + } 715 + 716 + if (value != vmem->requested_size) { 717 + virtio_mem_resize_usable_region(vmem, value, false); 718 + vmem->requested_size = value; 719 + } 720 + /* 721 + * Trigger a config update so the guest gets notified. 
We trigger 722 + * even if the size didn't change (especially helpful for debugging). 723 + */ 724 + virtio_notify_config(VIRTIO_DEVICE(vmem)); 725 + } else { 726 + vmem->requested_size = value; 727 + } 728 + } 729 + 730 + static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name, 731 + void *opaque, Error **errp) 732 + { 733 + const VirtIOMEM *vmem = VIRTIO_MEM(obj); 734 + uint64_t value = vmem->block_size; 735 + 736 + visit_type_size(v, name, &value, errp); 737 + } 738 + 739 + static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name, 740 + void *opaque, Error **errp) 741 + { 742 + VirtIOMEM *vmem = VIRTIO_MEM(obj); 743 + Error *err = NULL; 744 + uint64_t value; 745 + 746 + if (DEVICE(obj)->realized) { 747 + error_setg(errp, "'%s' cannot be changed", name); 748 + return; 749 + } 750 + 751 + visit_type_size(v, name, &value, &err); 752 + if (err) { 753 + error_propagate(errp, err); 754 + return; 755 + } 756 + 757 + if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) { 758 + error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name, 759 + VIRTIO_MEM_MIN_BLOCK_SIZE); 760 + return; 761 + } else if (!is_power_of_2(value)) { 762 + error_setg(errp, "'%s' property has to be a power of two", name); 763 + return; 764 + } 765 + vmem->block_size = value; 766 + } 767 + 768 + static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem) 769 + { 770 + void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block); 771 + unsigned long first_zero_bit, last_zero_bit; 772 + uint64_t offset, length; 773 + 774 + /* 775 + * Find consecutive unplugged blocks and exclude them from migration. 776 + * 777 + * Note: Blocks cannot get (un)plugged during precopy, no locking needed. 
778 + */ 779 + first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); 780 + while (first_zero_bit < vmem->bitmap_size) { 781 + offset = first_zero_bit * vmem->block_size; 782 + last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, 783 + first_zero_bit + 1) - 1; 784 + length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; 785 + 786 + qemu_guest_free_page_hint(host + offset, length); 787 + first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, 788 + last_zero_bit + 2); 789 + } 790 + } 791 + 792 + static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data) 793 + { 794 + VirtIOMEM *vmem = container_of(n, VirtIOMEM, precopy_notifier); 795 + PrecopyNotifyData *pnd = data; 796 + 797 + switch (pnd->reason) { 798 + case PRECOPY_NOTIFY_SETUP: 799 + precopy_enable_free_page_optimization(); 800 + break; 801 + case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC: 802 + virtio_mem_precopy_exclude_unplugged(vmem); 803 + break; 804 + default: 805 + break; 806 + } 807 + 808 + return 0; 809 + } 810 + 811 + static void virtio_mem_instance_init(Object *obj) 812 + { 813 + VirtIOMEM *vmem = VIRTIO_MEM(obj); 814 + 815 + vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE; 816 + notifier_list_init(&vmem->size_change_notifiers); 817 + vmem->precopy_notifier.notify = virtio_mem_precopy_notify; 818 + 819 + object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size, 820 + NULL, NULL, NULL); 821 + object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size", 822 + virtio_mem_get_requested_size, 823 + virtio_mem_set_requested_size, NULL, NULL); 824 + object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size", 825 + virtio_mem_get_block_size, virtio_mem_set_block_size, 826 + NULL, NULL); 827 + } 828 + 829 + static Property virtio_mem_properties[] = { 830 + DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0), 831 + DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0), 832 + DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, 
VirtIOMEM, memdev, 833 + TYPE_MEMORY_BACKEND, HostMemoryBackend *), 834 + DEFINE_PROP_END_OF_LIST(), 835 + }; 836 + 837 + static void virtio_mem_class_init(ObjectClass *klass, void *data) 838 + { 839 + DeviceClass *dc = DEVICE_CLASS(klass); 840 + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); 841 + VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); 842 + 843 + device_class_set_props(dc, virtio_mem_properties); 844 + dc->vmsd = &vmstate_virtio_mem; 845 + 846 + set_bit(DEVICE_CATEGORY_MISC, dc->categories); 847 + vdc->realize = virtio_mem_device_realize; 848 + vdc->unrealize = virtio_mem_device_unrealize; 849 + vdc->get_config = virtio_mem_get_config; 850 + vdc->get_features = virtio_mem_get_features; 851 + vdc->vmsd = &vmstate_virtio_mem_device; 852 + 853 + vmc->fill_device_info = virtio_mem_fill_device_info; 854 + vmc->get_memory_region = virtio_mem_get_memory_region; 855 + vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier; 856 + vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier; 857 + } 858 + 859 + static const TypeInfo virtio_mem_info = { 860 + .name = TYPE_VIRTIO_MEM, 861 + .parent = TYPE_VIRTIO_DEVICE, 862 + .instance_size = sizeof(VirtIOMEM), 863 + .instance_init = virtio_mem_instance_init, 864 + .class_init = virtio_mem_class_init, 865 + .class_size = sizeof(VirtIOMEMClass), 866 + }; 867 + 868 + static void virtio_register_types(void) 869 + { 870 + type_register_static(&virtio_mem_info); 871 + } 872 + 873 + type_init(virtio_register_types)
+13
hw/virtio/virtio-pci.c
··· 1107 1107 return pci_get_address_space(dev); 1108 1108 } 1109 1109 1110 + static bool virtio_pci_queue_enabled(DeviceState *d, int n) 1111 + { 1112 + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); 1113 + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); 1114 + 1115 + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { 1116 + return proxy->vqs[vdev->queue_sel].enabled; 1117 + } 1118 + 1119 + return virtio_queue_enabled(vdev, n); 1120 + } 1121 + 1110 1122 static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, 1111 1123 struct virtio_pci_cap *cap) 1112 1124 { ··· 2064 2076 k->ioeventfd_enabled = virtio_pci_ioeventfd_enabled; 2065 2077 k->ioeventfd_assign = virtio_pci_ioeventfd_assign; 2066 2078 k->get_dma_as = virtio_pci_get_dma_as; 2079 + k->queue_enabled = virtio_pci_queue_enabled; 2067 2080 } 2068 2081 2069 2082 static const TypeInfo virtio_pci_bus_info = {
+6
hw/virtio/virtio.c
··· 3286 3286 3287 3287 bool virtio_queue_enabled(VirtIODevice *vdev, int n) 3288 3288 { 3289 + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); 3290 + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); 3291 + 3292 + if (k->queue_enabled) { 3293 + return k->queue_enabled(qbus->parent, n); 3294 + } 3289 3295 return virtio_queue_get_desc_addr(vdev, n) != 0; 3290 3296 } 3291 3297
+41
include/exec/memory.h
··· 2478 2478 } 2479 2479 #endif 2480 2480 2481 + /* 2482 + * Inhibit technologies that require discarding of pages in RAM blocks, e.g., 2483 + * to manage the actual amount of memory consumed by the VM (then, the memory 2484 + * provided by RAM blocks might be bigger than the desired memory consumption). 2485 + * This *must* be set if: 2486 + * - Discarding parts of a RAM blocks does not result in the change being 2487 + * reflected in the VM and the pages getting freed. 2488 + * - All memory in RAM blocks is pinned or duplicated, invaldiating any previous 2489 + * discards blindly. 2490 + * - Discarding parts of a RAM blocks will result in integrity issues (e.g., 2491 + * encrypted VMs). 2492 + * Technologies that only temporarily pin the current working set of a 2493 + * driver are fine, because we don't expect such pages to be discarded 2494 + * (esp. based on guest action like balloon inflation). 2495 + * 2496 + * This is *not* to be used to protect from concurrent discards (esp., 2497 + * postcopy). 2498 + * 2499 + * Returns 0 if successful. Returns -EBUSY if a technology that relies on 2500 + * discards to work reliably is active. 2501 + */ 2502 + int ram_block_discard_disable(bool state); 2503 + 2504 + /* 2505 + * Inhibit technologies that disable discarding of pages in RAM blocks. 2506 + * 2507 + * Returns 0 if successful. Returns -EBUSY if discards are already set to 2508 + * broken. 2509 + */ 2510 + int ram_block_discard_require(bool state); 2511 + 2512 + /* 2513 + * Test if discarding of memory in ram blocks is disabled. 2514 + */ 2515 + bool ram_block_discard_is_disabled(void); 2516 + 2517 + /* 2518 + * Test if discarding of memory in ram blocks is required to work reliably. 2519 + */ 2520 + bool ram_block_discard_is_required(void); 2521 + 2481 2522 #endif 2482 2523 2483 2524 #endif
+1
include/hw/boards.h
··· 207 207 const char **valid_cpu_types; 208 208 strList *allowed_dynamic_sysbus_devices; 209 209 bool auto_enable_numa_with_memhp; 210 + bool auto_enable_numa_with_memdev; 210 211 void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes, 211 212 int nb_nodes, ram_addr_t size); 212 213 bool ignore_boot_device_suffixes;
+1
include/hw/pci/pci.h
··· 87 87 #define PCI_DEVICE_ID_VIRTIO_VSOCK 0x1012 88 88 #define PCI_DEVICE_ID_VIRTIO_PMEM 0x1013 89 89 #define PCI_DEVICE_ID_VIRTIO_IOMMU 0x1014 90 + #define PCI_DEVICE_ID_VIRTIO_MEM 0x1015 90 91 91 92 #define PCI_VENDOR_ID_REDHAT 0x1b36 92 93 #define PCI_DEVICE_ID_REDHAT_BRIDGE 0x0001
+2 -2
include/hw/vfio/vfio-common.h
··· 108 108 bool reset_works; 109 109 bool needs_reset; 110 110 bool no_mmap; 111 - bool balloon_allowed; 111 + bool ram_block_discard_allowed; 112 112 VFIODeviceOps *ops; 113 113 unsigned int num_irqs; 114 114 unsigned int num_regions; ··· 128 128 QLIST_HEAD(, VFIODevice) device_list; 129 129 QLIST_ENTRY(VFIOGroup) next; 130 130 QLIST_ENTRY(VFIOGroup) container_next; 131 - bool balloon_allowed; 131 + bool ram_block_discard_allowed; 132 132 } VFIOGroup; 133 133 134 134 typedef struct VFIODMABuf {
+18 -1
include/hw/virtio/vhost-backend.h
··· 17 17 VHOST_BACKEND_TYPE_NONE = 0, 18 18 VHOST_BACKEND_TYPE_KERNEL = 1, 19 19 VHOST_BACKEND_TYPE_USER = 2, 20 - VHOST_BACKEND_TYPE_MAX = 3, 20 + VHOST_BACKEND_TYPE_VDPA = 3, 21 + VHOST_BACKEND_TYPE_MAX = 4, 21 22 } VhostBackendType; 22 23 23 24 typedef enum VhostSetConfigType { ··· 34 35 struct vhost_vring_addr; 35 36 struct vhost_scsi_target; 36 37 struct vhost_iotlb_msg; 38 + struct vhost_virtqueue; 37 39 38 40 typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque); 39 41 typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev); ··· 112 114 typedef int (*vhost_set_inflight_fd_op)(struct vhost_dev *dev, 113 115 struct vhost_inflight *inflight); 114 116 117 + typedef int (*vhost_dev_start_op)(struct vhost_dev *dev, bool started); 118 + 119 + typedef int (*vhost_vq_get_addr_op)(struct vhost_dev *dev, 120 + struct vhost_vring_addr *addr, 121 + struct vhost_virtqueue *vq); 122 + 123 + typedef int (*vhost_get_device_id_op)(struct vhost_dev *dev, uint32_t *dev_id); 124 + 125 + typedef bool (*vhost_force_iommu_op)(struct vhost_dev *dev); 126 + 115 127 typedef struct VhostOps { 116 128 VhostBackendType backend_type; 117 129 vhost_backend_init vhost_backend_init; ··· 152 164 vhost_backend_mem_section_filter_op vhost_backend_mem_section_filter; 153 165 vhost_get_inflight_fd_op vhost_get_inflight_fd; 154 166 vhost_set_inflight_fd_op vhost_set_inflight_fd; 167 + vhost_dev_start_op vhost_dev_start; 168 + vhost_vq_get_addr_op vhost_vq_get_addr; 169 + vhost_get_device_id_op vhost_get_device_id; 170 + vhost_force_iommu_op vhost_force_iommu; 155 171 } VhostOps; 156 172 157 173 extern const VhostOps user_ops; 174 + extern const VhostOps vdpa_ops; 158 175 159 176 int vhost_set_backend_type(struct vhost_dev *dev, 160 177 VhostBackendType backend_type);
+26
include/hw/virtio/vhost-vdpa.h
··· 1 + /* 2 + * vhost-vdpa.h 3 + * 4 + * Copyright(c) 2017-2018 Intel Corporation. 5 + * Copyright(c) 2020 Red Hat, Inc. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 + * See the COPYING file in the top-level directory. 9 + * 10 + */ 11 + 12 + #ifndef HW_VIRTIO_VHOST_VDPA_H 13 + #define HW_VIRTIO_VHOST_VDPA_H 14 + 15 + #include "hw/virtio/virtio.h" 16 + 17 + typedef struct vhost_vdpa { 18 + int device_fd; 19 + uint32_t msg_type; 20 + MemoryListener listener; 21 + } VhostVDPA; 22 + 23 + extern AddressSpace address_space_memory; 24 + extern int vhost_vdpa_get_device_id(struct vhost_dev *dev, 25 + uint32_t *device_id); 26 + #endif
+7
include/hw/virtio/vhost.h
··· 92 92 const VhostDevConfigOps *config_ops; 93 93 }; 94 94 95 + struct vhost_net { 96 + struct vhost_dev dev; 97 + struct vhost_virtqueue vqs[2]; 98 + int backend; 99 + NetClientState *nc; 100 + }; 101 + 95 102 int vhost_dev_init(struct vhost_dev *hdev, void *opaque, 96 103 VhostBackendType backend_type, 97 104 uint32_t busyloop_timeout);
+4
include/hw/virtio/virtio-bus.h
··· 84 84 int (*ioeventfd_assign)(DeviceState *d, EventNotifier *notifier, 85 85 int n, bool assign); 86 86 /* 87 + * Whether queue number n is enabled. 88 + */ 89 + bool (*queue_enabled)(DeviceState *d, int n); 90 + /* 87 91 * Does the transport have variable vring alignment? 88 92 * (ie can it ever call virtio_queue_set_align()?) 89 93 * Note that changing this will break migration for this transport.
+86
include/hw/virtio/virtio-mem.h
··· 1 + /* 2 + * Virtio MEM device 3 + * 4 + * Copyright (C) 2020 Red Hat, Inc. 5 + * 6 + * Authors: 7 + * David Hildenbrand <david@redhat.com> 8 + * 9 + * This work is licensed under the terms of the GNU GPL, version 2. 10 + * See the COPYING file in the top-level directory. 11 + */ 12 + 13 + #ifndef HW_VIRTIO_MEM_H 14 + #define HW_VIRTIO_MEM_H 15 + 16 + #include "standard-headers/linux/virtio_mem.h" 17 + #include "hw/virtio/virtio.h" 18 + #include "qapi/qapi-types-misc.h" 19 + #include "sysemu/hostmem.h" 20 + 21 + #define TYPE_VIRTIO_MEM "virtio-mem" 22 + 23 + #define VIRTIO_MEM(obj) \ 24 + OBJECT_CHECK(VirtIOMEM, (obj), TYPE_VIRTIO_MEM) 25 + #define VIRTIO_MEM_CLASS(oc) \ 26 + OBJECT_CLASS_CHECK(VirtIOMEMClass, (oc), TYPE_VIRTIO_MEM) 27 + #define VIRTIO_MEM_GET_CLASS(obj) \ 28 + OBJECT_GET_CLASS(VirtIOMEMClass, (obj), TYPE_VIRTIO_MEM) 29 + 30 + #define VIRTIO_MEM_MEMDEV_PROP "memdev" 31 + #define VIRTIO_MEM_NODE_PROP "node" 32 + #define VIRTIO_MEM_SIZE_PROP "size" 33 + #define VIRTIO_MEM_REQUESTED_SIZE_PROP "requested-size" 34 + #define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size" 35 + #define VIRTIO_MEM_ADDR_PROP "memaddr" 36 + 37 + typedef struct VirtIOMEM { 38 + VirtIODevice parent_obj; 39 + 40 + /* guest -> host request queue */ 41 + VirtQueue *vq; 42 + 43 + /* bitmap used to track unplugged memory */ 44 + int32_t bitmap_size; 45 + unsigned long *bitmap; 46 + 47 + /* assigned memory backend and memory region */ 48 + HostMemoryBackend *memdev; 49 + 50 + /* NUMA node */ 51 + uint32_t node; 52 + 53 + /* assigned address of the region in guest physical memory */ 54 + uint64_t addr; 55 + 56 + /* usable region size (<= region_size) */ 57 + uint64_t usable_region_size; 58 + 59 + /* actual size (how much the guest plugged) */ 60 + uint64_t size; 61 + 62 + /* requested size */ 63 + uint64_t requested_size; 64 + 65 + /* block size and alignment */ 66 + uint64_t block_size; 67 + 68 + /* notifiers to notify when "size" changes */ 69 + NotifierList size_change_notifiers; 70 + 
71 + /* don't migrate unplugged memory */ 72 + NotifierWithReturn precopy_notifier; 73 + } VirtIOMEM; 74 + 75 + typedef struct VirtIOMEMClass { 76 + /* private */ 77 + VirtIODevice parent; 78 + 79 + /* public */ 80 + void (*fill_device_info)(const VirtIOMEM *vmen, VirtioMEMDeviceInfo *vi); 81 + MemoryRegion *(*get_memory_region)(VirtIOMEM *vmem, Error **errp); 82 + void (*add_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier); 83 + void (*remove_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier); 84 + } VirtIOMEMClass; 85 + 86 + #endif
+1 -1
include/migration/colo.h
··· 25 25 bool migration_in_colo_state(void); 26 26 27 27 /* loadvm */ 28 - void migration_incoming_enable_colo(void); 28 + int migration_incoming_enable_colo(void); 29 29 void migration_incoming_disable_colo(void); 30 30 bool migration_incoming_colo_enabled(void); 31 31 void *colo_process_incoming_thread(void *opaque);
+2
include/migration/misc.h
··· 69 69 /* ...and after the device transmission */ 70 70 bool migration_in_postcopy_after_devices(MigrationState *); 71 71 void migration_global_dump(Monitor *mon); 72 + /* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */ 73 + bool migration_in_incoming_postcopy(void); 72 74 73 75 /* migration/block-dirty-bitmap.c */ 74 76 void dirty_bitmap_mig_init(void);
+1
include/net/net.h
··· 176 176 void net_socket_rs_init(SocketReadState *rs, 177 177 SocketReadStateFinalize *finalize, 178 178 bool vnet_hdr); 179 + NetClientState *qemu_get_peer(NetClientState *nc, int queue_index); 179 180 180 181 /* NIC info */ 181 182
+22
include/net/vhost-vdpa.h
··· 1 + /* 2 + * vhost-vdpa.h 3 + * 4 + * Copyright(c) 2017-2018 Intel Corporation. 5 + * Copyright(c) 2020 Red Hat, Inc. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 + * See the COPYING file in the top-level directory. 9 + * 10 + */ 11 + 12 + #ifndef VHOST_VDPA_H 13 + #define VHOST_VDPA_H 14 + 15 + #define TYPE_VHOST_VDPA "vhost-vdpa" 16 + 17 + struct vhost_net *vhost_vdpa_get_vhost_net(NetClientState *nc); 18 + uint64_t vhost_vdpa_get_acked_features(NetClientState *nc); 19 + 20 + extern const int vdpa_feature_bits[]; 21 + 22 + #endif /* VHOST_VDPA_H */
+5
include/net/vhost_net.h
··· 28 28 uint64_t vhost_net_get_features(VHostNetState *net, uint64_t features); 29 29 void vhost_net_ack_features(VHostNetState *net, uint64_t features); 30 30 31 + int vhost_net_get_config(struct vhost_net *net, uint8_t *config, 32 + uint32_t config_len); 33 + 34 + int vhost_net_set_config(struct vhost_net *net, const uint8_t *data, 35 + uint32_t offset, uint32_t size, uint32_t flags); 31 36 bool vhost_net_virtqueue_pending(VHostNetState *net, int n); 32 37 void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, 33 38 int idx, bool mask);
-2
include/sysemu/balloon.h
··· 23 23 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, 24 24 QEMUBalloonStatus *stat_func, void *opaque); 25 25 void qemu_remove_balloon_handler(void *opaque); 26 - bool qemu_balloon_is_inhibited(void); 27 - void qemu_balloon_inhibit(bool state); 28 26 29 27 #endif
+14 -1
migration/migration.c
··· 338 338 339 339 void migration_incoming_disable_colo(void) 340 340 { 341 + ram_block_discard_disable(false); 341 342 migration_colo_enabled = false; 342 343 } 343 344 344 - void migration_incoming_enable_colo(void) 345 + int migration_incoming_enable_colo(void) 345 346 { 347 + if (ram_block_discard_disable(true)) { 348 + error_report("COLO: cannot disable RAM discard"); 349 + return -EBUSY; 350 + } 346 351 migration_colo_enabled = true; 352 + return 0; 347 353 } 348 354 349 355 void migrate_add_address(SocketAddress *address) ··· 1770 1776 bool migration_in_postcopy_after_devices(MigrationState *s) 1771 1777 { 1772 1778 return migration_in_postcopy() && s->postcopy_after_devices; 1779 + } 1780 + 1781 + bool migration_in_incoming_postcopy(void) 1782 + { 1783 + PostcopyState ps = postcopy_state_get(); 1784 + 1785 + return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END; 1773 1786 } 1774 1787 1775 1788 bool migration_is_idle(void)
-23
migration/postcopy-ram.c
··· 27 27 #include "qemu/notify.h" 28 28 #include "qemu/rcu.h" 29 29 #include "sysemu/sysemu.h" 30 - #include "sysemu/balloon.h" 31 30 #include "qemu/error-report.h" 32 31 #include "trace.h" 33 32 #include "hw/boards.h" ··· 521 520 } 522 521 523 522 /* 524 - * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage, 525 - * last caller wins. 526 - */ 527 - static void postcopy_balloon_inhibit(bool state) 528 - { 529 - static bool cur_state = false; 530 - 531 - if (state != cur_state) { 532 - qemu_balloon_inhibit(state); 533 - cur_state = state; 534 - } 535 - } 536 - 537 - /* 538 523 * At the end of a migration where postcopy_ram_incoming_init was called. 539 524 */ 540 525 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) ··· 564 549 close(mis->userfault_event_fd); 565 550 mis->have_fault_thread = false; 566 551 } 567 - 568 - postcopy_balloon_inhibit(false); 569 552 570 553 if (enable_mlock) { 571 554 if (os_mlock() < 0) { ··· 1159 1142 return -e; 1160 1143 } 1161 1144 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size); 1162 - 1163 - /* 1164 - * Ballooning can mark pages as absent while we're postcopying 1165 - * that would cause false userfaults. 1166 - */ 1167 - postcopy_balloon_inhibit(true); 1168 1145 1169 1146 trace_postcopy_ram_enable_notify(); 1170 1147
+16 -2
migration/rdma.c
··· 29 29 #include "qemu/sockets.h" 30 30 #include "qemu/bitmap.h" 31 31 #include "qemu/coroutine.h" 32 + #include "exec/memory.h" 32 33 #include <sys/socket.h> 33 34 #include <netdb.h> 34 35 #include <arpa/inet.h> ··· 4016 4017 Error *local_err = NULL; 4017 4018 4018 4019 trace_rdma_start_incoming_migration(); 4019 - rdma = qemu_rdma_data_init(host_port, &local_err); 4020 4020 4021 + /* Avoid ram_block_discard_disable(), cannot change during migration. */ 4022 + if (ram_block_discard_is_required()) { 4023 + error_setg(errp, "RDMA: cannot disable RAM discard"); 4024 + return; 4025 + } 4026 + 4027 + rdma = qemu_rdma_data_init(host_port, &local_err); 4021 4028 if (rdma == NULL) { 4022 4029 goto err; 4023 4030 } ··· 4066 4073 const char *host_port, Error **errp) 4067 4074 { 4068 4075 MigrationState *s = opaque; 4069 - RDMAContext *rdma = qemu_rdma_data_init(host_port, errp); 4070 4076 RDMAContext *rdma_return_path = NULL; 4077 + RDMAContext *rdma; 4071 4078 int ret = 0; 4072 4079 4080 + /* Avoid ram_block_discard_disable(), cannot change during migration. */ 4081 + if (ram_block_discard_is_required()) { 4082 + error_setg(errp, "RDMA: cannot disable RAM discard"); 4083 + return; 4084 + } 4085 + 4086 + rdma = qemu_rdma_data_init(host_port, errp); 4073 4087 if (rdma == NULL) { 4074 4088 goto err; 4075 4089 }
+9 -2
migration/savevm.c
··· 2111 2111 2112 2112 static int loadvm_process_enable_colo(MigrationIncomingState *mis) 2113 2113 { 2114 - migration_incoming_enable_colo(); 2115 - return colo_init_ram_cache(); 2114 + int ret = migration_incoming_enable_colo(); 2115 + 2116 + if (!ret) { 2117 + ret = colo_init_ram_cache(); 2118 + if (ret) { 2119 + migration_incoming_disable_colo(); 2120 + } 2121 + } 2122 + return ret; 2116 2123 } 2117 2124 2118 2125 /*
+16
monitor/hmp-cmds.c
··· 1821 1821 MemoryDeviceInfoList *info_list = qmp_query_memory_devices(&err); 1822 1822 MemoryDeviceInfoList *info; 1823 1823 VirtioPMEMDeviceInfo *vpi; 1824 + VirtioMEMDeviceInfo *vmi; 1824 1825 MemoryDeviceInfo *value; 1825 1826 PCDIMMDeviceInfo *di; 1826 1827 ··· 1854 1855 monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", vpi->memaddr); 1855 1856 monitor_printf(mon, " size: %" PRIu64 "\n", vpi->size); 1856 1857 monitor_printf(mon, " memdev: %s\n", vpi->memdev); 1858 + break; 1859 + case MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM: 1860 + vmi = value->u.virtio_mem.data; 1861 + monitor_printf(mon, "Memory device [%s]: \"%s\"\n", 1862 + MemoryDeviceInfoKind_str(value->type), 1863 + vmi->id ? vmi->id : ""); 1864 + monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", vmi->memaddr); 1865 + monitor_printf(mon, " node: %" PRId64 "\n", vmi->node); 1866 + monitor_printf(mon, " requested-size: %" PRIu64 "\n", 1867 + vmi->requested_size); 1868 + monitor_printf(mon, " size: %" PRIu64 "\n", vmi->size); 1869 + monitor_printf(mon, " max-size: %" PRIu64 "\n", vmi->max_size); 1870 + monitor_printf(mon, " block-size: %" PRIu64 "\n", 1871 + vmi->block_size); 1872 + monitor_printf(mon, " memdev: %s\n", vmi->memdev); 1857 1873 break; 1858 1874 default: 1859 1875 g_assert_not_reached();
+1
monitor/monitor.c
··· 235 235 [QAPI_EVENT_QUORUM_REPORT_BAD] = { 1000 * SCALE_MS }, 236 236 [QAPI_EVENT_QUORUM_FAILURE] = { 1000 * SCALE_MS }, 237 237 [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, 238 + [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS }, 238 239 }; 239 240 240 241 /*
+1 -1
net/Makefile.objs
··· 26 26 tap-obj-y ?= tap-stub.o 27 27 common-obj-$(CONFIG_POSIX) += tap.o $(tap-obj-y) 28 28 common-obj-$(CONFIG_WIN32) += tap-win32.o 29 - 29 + common-obj-$(CONFIG_VHOST_NET_VDPA) += vhost-vdpa.o 30 30 vde.o-libs = $(VDE_LIBS) 31 31 32 32 common-obj-$(CONFIG_CAN_BUS) += can/
+2
net/clients.h
··· 61 61 int net_init_vhost_user(const Netdev *netdev, const char *name, 62 62 NetClientState *peer, Error **errp); 63 63 64 + int net_init_vhost_vdpa(const Netdev *netdev, const char *name, 65 + NetClientState *peer, Error **errp); 64 66 #endif /* QEMU_NET_CLIENTS_H */
+10
net/net.c
··· 325 325 return nic->opaque; 326 326 } 327 327 328 + NetClientState *qemu_get_peer(NetClientState *nc, int queue_index) 329 + { 330 + assert(nc != NULL); 331 + NetClientState *ncs = nc + queue_index; 332 + return ncs->peer; 333 + } 334 + 328 335 static void qemu_cleanup_net_client(NetClientState *nc) 329 336 { 330 337 QTAILQ_REMOVE(&net_clients, nc, next); ··· 958 965 [NET_CLIENT_DRIVER_HUBPORT] = net_init_hubport, 959 966 #ifdef CONFIG_VHOST_NET_USER 960 967 [NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user, 968 + #endif 969 + #ifdef CONFIG_VHOST_NET_VDPA 970 + [NET_CLIENT_DRIVER_VHOST_VDPA] = net_init_vhost_vdpa, 961 971 #endif 962 972 #ifdef CONFIG_L2TPV3 963 973 [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
+228
net/vhost-vdpa.c
··· 1 + /* 2 + * vhost-vdpa.c 3 + * 4 + * Copyright(c) 2017-2018 Intel Corporation. 5 + * Copyright(c) 2020 Red Hat, Inc. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 + * See the COPYING file in the top-level directory. 9 + * 10 + */ 11 + 12 + #include "qemu/osdep.h" 13 + #include "clients.h" 14 + #include "net/vhost_net.h" 15 + #include "net/vhost-vdpa.h" 16 + #include "hw/virtio/vhost-vdpa.h" 17 + #include "qemu/config-file.h" 18 + #include "qemu/error-report.h" 19 + #include "qemu/option.h" 20 + #include "qapi/error.h" 21 + #include <sys/ioctl.h> 22 + #include <err.h> 23 + #include "standard-headers/linux/virtio_net.h" 24 + #include "monitor/monitor.h" 25 + #include "hw/virtio/vhost.h" 26 + 27 + /* Todo:need to add the multiqueue support here */ 28 + typedef struct VhostVDPAState { 29 + NetClientState nc; 30 + struct vhost_vdpa vhost_vdpa; 31 + VHostNetState *vhost_net; 32 + uint64_t acked_features; 33 + bool started; 34 + } VhostVDPAState; 35 + 36 + const int vdpa_feature_bits[] = { 37 + VIRTIO_F_NOTIFY_ON_EMPTY, 38 + VIRTIO_RING_F_INDIRECT_DESC, 39 + VIRTIO_RING_F_EVENT_IDX, 40 + VIRTIO_F_ANY_LAYOUT, 41 + VIRTIO_F_VERSION_1, 42 + VIRTIO_NET_F_CSUM, 43 + VIRTIO_NET_F_GUEST_CSUM, 44 + VIRTIO_NET_F_GSO, 45 + VIRTIO_NET_F_GUEST_TSO4, 46 + VIRTIO_NET_F_GUEST_TSO6, 47 + VIRTIO_NET_F_GUEST_ECN, 48 + VIRTIO_NET_F_GUEST_UFO, 49 + VIRTIO_NET_F_HOST_TSO4, 50 + VIRTIO_NET_F_HOST_TSO6, 51 + VIRTIO_NET_F_HOST_ECN, 52 + VIRTIO_NET_F_HOST_UFO, 53 + VIRTIO_NET_F_MRG_RXBUF, 54 + VIRTIO_NET_F_MTU, 55 + VIRTIO_F_IOMMU_PLATFORM, 56 + VIRTIO_F_RING_PACKED, 57 + VIRTIO_NET_F_GUEST_ANNOUNCE, 58 + VHOST_INVALID_FEATURE_BIT 59 + }; 60 + 61 + VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc) 62 + { 63 + VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); 64 + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); 65 + return s->vhost_net; 66 + } 67 + 68 + uint64_t vhost_vdpa_get_acked_features(NetClientState *nc) 69 + { 70 + 
VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); 71 + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); 72 + s->acked_features = vhost_net_get_acked_features(s->vhost_net); 73 + 74 + return s->acked_features; 75 + } 76 + 77 + static int vhost_vdpa_net_check_device_id(struct vhost_net *net) 78 + { 79 + uint32_t device_id; 80 + int ret; 81 + struct vhost_dev *hdev; 82 + 83 + hdev = (struct vhost_dev *)&net->dev; 84 + ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id); 85 + if (device_id != VIRTIO_ID_NET) { 86 + return -ENOTSUP; 87 + } 88 + return ret; 89 + } 90 + 91 + static void vhost_vdpa_del(NetClientState *ncs) 92 + { 93 + VhostVDPAState *s; 94 + assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); 95 + s = DO_UPCAST(VhostVDPAState, nc, ncs); 96 + if (s->vhost_net) { 97 + vhost_net_cleanup(s->vhost_net); 98 + } 99 + } 100 + 101 + static int vhost_vdpa_add(NetClientState *ncs, void *be) 102 + { 103 + VhostNetOptions options; 104 + struct vhost_net *net = NULL; 105 + VhostVDPAState *s; 106 + int ret; 107 + 108 + options.backend_type = VHOST_BACKEND_TYPE_VDPA; 109 + assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); 110 + s = DO_UPCAST(VhostVDPAState, nc, ncs); 111 + options.net_backend = ncs; 112 + options.opaque = be; 113 + options.busyloop_timeout = 0; 114 + 115 + net = vhost_net_init(&options); 116 + if (!net) { 117 + error_report("failed to init vhost_net for queue"); 118 + goto err; 119 + } 120 + if (s->vhost_net) { 121 + vhost_net_cleanup(s->vhost_net); 122 + g_free(s->vhost_net); 123 + } 124 + s->vhost_net = net; 125 + ret = vhost_vdpa_net_check_device_id(net); 126 + if (ret) { 127 + goto err; 128 + } 129 + return 0; 130 + err: 131 + if (net) { 132 + vhost_net_cleanup(net); 133 + } 134 + vhost_vdpa_del(ncs); 135 + return -1; 136 + } 137 + 138 + static void vhost_vdpa_cleanup(NetClientState *nc) 139 + { 140 + VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); 141 + 142 + if (s->vhost_net) { 143 + 
vhost_net_cleanup(s->vhost_net); 144 + g_free(s->vhost_net); 145 + s->vhost_net = NULL; 146 + } 147 + } 148 + 149 + static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc) 150 + { 151 + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); 152 + 153 + return true; 154 + } 155 + 156 + static bool vhost_vdpa_has_ufo(NetClientState *nc) 157 + { 158 + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA); 159 + VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); 160 + uint64_t features = 0; 161 + features |= (1ULL << VIRTIO_NET_F_HOST_UFO); 162 + features = vhost_net_get_features(s->vhost_net, features); 163 + return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO)); 164 + 165 + } 166 + 167 + static NetClientInfo net_vhost_vdpa_info = { 168 + .type = NET_CLIENT_DRIVER_VHOST_VDPA, 169 + .size = sizeof(VhostVDPAState), 170 + .cleanup = vhost_vdpa_cleanup, 171 + .has_vnet_hdr = vhost_vdpa_has_vnet_hdr, 172 + .has_ufo = vhost_vdpa_has_ufo, 173 + }; 174 + 175 + static int net_vhost_vdpa_init(NetClientState *peer, const char *device, 176 + const char *name, const char *vhostdev) 177 + { 178 + NetClientState *nc = NULL; 179 + VhostVDPAState *s; 180 + int vdpa_device_fd = -1; 181 + int ret = 0; 182 + assert(name); 183 + nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device, name); 184 + snprintf(nc->info_str, sizeof(nc->info_str), TYPE_VHOST_VDPA); 185 + nc->queue_index = 0; 186 + s = DO_UPCAST(VhostVDPAState, nc, nc); 187 + vdpa_device_fd = qemu_open(vhostdev, O_RDWR); 188 + if (vdpa_device_fd == -1) { 189 + return -errno; 190 + } 191 + s->vhost_vdpa.device_fd = vdpa_device_fd; 192 + ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa); 193 + assert(s->vhost_net); 194 + return ret; 195 + } 196 + 197 + static int net_vhost_check_net(void *opaque, QemuOpts *opts, Error **errp) 198 + { 199 + const char *name = opaque; 200 + const char *driver, *netdev; 201 + 202 + driver = qemu_opt_get(opts, "driver"); 203 + netdev = qemu_opt_get(opts, "netdev"); 204 + if (!driver || 
!netdev) { 205 + return 0; 206 + } 207 + if (strcmp(netdev, name) == 0 && 208 + !g_str_has_prefix(driver, "virtio-net-")) { 209 + error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*"); 210 + return -1; 211 + } 212 + return 0; 213 + } 214 + 215 + int net_init_vhost_vdpa(const Netdev *netdev, const char *name, 216 + NetClientState *peer, Error **errp) 217 + { 218 + const NetdevVhostVDPAOptions *opts; 219 + 220 + assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA); 221 + opts = &netdev->u.vhost_vdpa; 222 + /* verify net frontend */ 223 + if (qemu_opts_foreach(qemu_find_opts("device"), net_vhost_check_net, 224 + (char *)name, errp)) { 225 + return -1; 226 + } 227 + return net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, opts->vhostdev); 228 + }
+63 -1
qapi/misc.json
··· 1357 1357 } 1358 1358 1359 1359 ## 1360 + # @VirtioMEMDeviceInfo: 1361 + # 1362 + # VirtioMEMDevice state information 1363 + # 1364 + # @id: device's ID 1365 + # 1366 + # @memaddr: physical address in memory, where device is mapped 1367 + # 1368 + # @requested-size: the user requested size of the device 1369 + # 1370 + # @size: the (current) size of memory that the device provides 1371 + # 1372 + # @max-size: the maximum size of memory that the device can provide 1373 + # 1374 + # @block-size: the block size of memory that the device provides 1375 + # 1376 + # @node: NUMA node number where device is assigned to 1377 + # 1378 + # @memdev: memory backend linked with the region 1379 + # 1380 + # Since: 5.1 1381 + ## 1382 + { 'struct': 'VirtioMEMDeviceInfo', 1383 + 'data': { '*id': 'str', 1384 + 'memaddr': 'size', 1385 + 'requested-size': 'size', 1386 + 'size': 'size', 1387 + 'max-size': 'size', 1388 + 'block-size': 'size', 1389 + 'node': 'int', 1390 + 'memdev': 'str' 1391 + } 1392 + } 1393 + 1394 + ## 1360 1395 # @MemoryDeviceInfo: 1361 1396 # 1362 1397 # Union containing information about a memory device 1363 1398 # 1364 1399 # nvdimm is included since 2.12. virtio-pmem is included since 4.1. 1400 + # virtio-mem is included since 5.1. 1365 1401 # 1366 1402 # Since: 2.1 1367 1403 ## 1368 1404 { 'union': 'MemoryDeviceInfo', 1369 1405 'data': { 'dimm': 'PCDIMMDeviceInfo', 1370 1406 'nvdimm': 'PCDIMMDeviceInfo', 1371 - 'virtio-pmem': 'VirtioPMEMDeviceInfo' 1407 + 'virtio-pmem': 'VirtioPMEMDeviceInfo', 1408 + 'virtio-mem': 'VirtioMEMDeviceInfo' 1372 1409 } 1373 1410 } 1374 1411 ··· 1396 1433 # 1397 1434 ## 1398 1435 { 'command': 'query-memory-devices', 'returns': ['MemoryDeviceInfo'] } 1436 + 1437 + ## 1438 + # @MEMORY_DEVICE_SIZE_CHANGE: 1439 + # 1440 + # Emitted when the size of a memory device changes. Only emitted for memory 1441 + # devices that can actually change the size (e.g., virtio-mem due to guest 1442 + # action). 
1443 + # 1444 + # @id: device's ID 1445 + # @size: the new size of memory that the device provides 1446 + # 1447 + # Note: this event is rate-limited. 1448 + # 1449 + # Since: 5.1 1450 + # 1451 + # Example: 1452 + # 1453 + # <- { "event": "MEMORY_DEVICE_SIZE_CHANGE", 1454 + # "data": { "id": "vm0", "size": 1073741824}, 1455 + # "timestamp": { "seconds": 1588168529, "microseconds": 201316 } } 1456 + # 1457 + ## 1458 + { 'event': 'MEMORY_DEVICE_SIZE_CHANGE', 1459 + 'data': { '*id': 'str', 'size': 'size' } } 1460 + 1399 1461 1400 1462 ## 1401 1463 # @MEM_UNPLUG_ERROR:
+26 -2
qapi/net.json
··· 429 429 '*queues': 'int' } } 430 430 431 431 ## 432 + # @NetdevVhostVDPAOptions: 433 + # 434 + # Vhost-vdpa network backend 435 + # 436 + # vDPA device is a device that uses a datapath which complies with the virtio 437 + # specifications with a vendor specific control path. 438 + # 439 + # @vhostdev: path of vhost-vdpa device 440 + # (default:'/dev/vhost-vdpa-0') 441 + # 442 + # @queues: number of queues to be created for multiqueue vhost-vdpa 443 + # (default: 1) 444 + # 445 + # Since: 5.1 446 + ## 447 + { 'struct': 'NetdevVhostVDPAOptions', 448 + 'data': { 449 + '*vhostdev': 'str', 450 + '*queues': 'int' } } 451 + 452 + ## 432 453 # @NetClientDriver: 433 454 # 434 455 # Available netdev drivers. 435 456 # 436 457 # Since: 2.7 458 + # 459 + # @vhost-vdpa since 5.1 437 460 ## 438 461 { 'enum': 'NetClientDriver', 439 462 'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 440 - 'bridge', 'hubport', 'netmap', 'vhost-user' ] } 463 + 'bridge', 'hubport', 'netmap', 'vhost-user', 'vhost-vdpa' ] } 441 464 442 465 ## 443 466 # @Netdev: ··· 465 488 'bridge': 'NetdevBridgeOptions', 466 489 'hubport': 'NetdevHubPortOptions', 467 490 'netmap': 'NetdevNetmapOptions', 468 - 'vhost-user': 'NetdevVhostUserOptions' } } 491 + 'vhost-user': 'NetdevVhostUserOptions', 492 + 'vhost-vdpa': 'NetdevVhostVDPAOptions' } } 469 493 470 494 ## 471 495 # @NetFilterDirection:
+12
qemu-options.hx
··· 2419 2419 "-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n" 2420 2420 " configure a vhost-user network, backed by a chardev 'dev'\n" 2421 2421 #endif 2422 + #ifdef __linux__ 2423 + "-netdev vhost-vdpa,id=str,vhostdev=/path/to/dev\n" 2424 + " configure a vhost-vdpa network,Establish a vhost-vdpa netdev\n" 2425 + #endif 2422 2426 "-netdev hubport,id=str,hubid=n[,netdev=nd]\n" 2423 2427 " configure a hub port on the hub with ID 'n'\n", QEMU_ARCH_ALL) 2424 2428 DEF("nic", HAS_ARG, QEMU_OPTION_nic, ··· 2896 2900 -chardev socket,id=chr0,path=/path/to/socket \ 2897 2901 -netdev type=vhost-user,id=net0,chardev=chr0 \ 2898 2902 -device virtio-net-pci,netdev=net0 2903 + 2904 + ``-netdev vhost-vdpa,vhostdev=/path/to/dev`` 2905 + Establish a vhost-vdpa netdev. 2906 + 2907 + vDPA device is a device that uses a datapath which complies with 2908 + the virtio specifications with a vendor specific control path. 2909 + vDPA devices can be both physically located on the hardware or 2910 + emulated by software. 2899 2911 2900 2912 ``-netdev hubport,id=id,hubid=hubid[,netdev=nd]`` 2901 2913 Create a hub port on the emulated hub with ID hubid.
+7
target/i386/sev.c
··· 680 680 uint32_t host_cbitpos; 681 681 struct sev_user_data_status status = {}; 682 682 683 + ret = ram_block_discard_disable(true); 684 + if (ret) { 685 + error_report("%s: cannot disable RAM discard", __func__); 686 + return NULL; 687 + } 688 + 683 689 sev = lookup_sev_guest_info(id); 684 690 if (!sev) { 685 691 error_report("%s: '%s' is not a valid '%s' object", ··· 751 757 return sev; 752 758 err: 753 759 sev_guest = NULL; 760 + ram_block_discard_disable(false); 754 761 return NULL; 755 762 } 756 763
+52
tests/data/acpi/disassemle-aml.sh
··· 1 + #!/usr/bin/bash 2 + 3 + outdir= 4 + while getopts "o:" arg; do 5 + case ${arg} in 6 + o ) 7 + outdir=$OPTARG 8 + ;; 9 + \? ) 10 + echo "Usage: ./tests/data/acpi/disassemle-aml.sh [-o <output-directory>]" 11 + exit 1 12 + ;; 13 + 14 + esac 15 + done 16 + 17 + for machine in tests/data/acpi/* 18 + do 19 + if [[ ! -d "$machine" ]]; 20 + then 21 + continue 22 + fi 23 + 24 + if [[ "${outdir}" ]]; 25 + then 26 + mkdir -p "${outdir}"/${machine} || exit $? 27 + fi 28 + for aml in $machine/* 29 + do 30 + if [[ "$aml" == $machine/*.dsl ]]; 31 + then 32 + continue 33 + fi 34 + if [[ "$aml" == $machine/SSDT*.* ]]; 35 + then 36 + dsdt=${aml/SSDT*./DSDT.} 37 + extra="-e ${dsdt}" 38 + elif [[ "$aml" == $machine/SSDT* ]]; 39 + then 40 + dsdt=${aml/SSDT*/DSDT}; 41 + extra="-e ${dsdt}" 42 + else 43 + extra="" 44 + fi 45 + asl=${aml}.dsl 46 + if [[ "${outdir}" ]]; 47 + then 48 + asl="${outdir}"/${machine}/${asl} 49 + fi 50 + iasl -d -p ${asl} ${extra} ${aml} 51 + done 52 + done
+1
tests/data/acpi/rebuild-expected-aml.sh
··· 36 36 echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h 37 37 38 38 echo "The files were rebuilt and can be added to git." 39 + echo "You can use ${SRC_PATH}/tests/data/acpi/disassemle-aml.sh to disassemble them to ASL." 39 40 40 41 if [ -z "$old_allowed_dif" ]; then 41 42 echo "Note! Please do not commit expected files with source changes"
+1 -1
tests/qtest/migration-test.c
··· 1211 1211 * without throttling. 1212 1212 */ 1213 1213 migrate_set_parameter_int(from, "downtime-limit", 1); 1214 - migrate_set_parameter_int(from, "max-bandwidth", 1000000); /* ~1Mb/s */ 1214 + migrate_set_parameter_int(from, "max-bandwidth", 100000000); /* ~100Mb/s */ 1215 1215 1216 1216 /* To check remaining size after precopy */ 1217 1217 migrate_set_capability(from, "pause-before-switchover", true);