···353353 * a platform-dependent stride. On top of that the memory can apply
354354 * platform-depending swizzling of some higher address bits into bit6.
355355 *
356356- * This format is highly platforms specific and not useful for cross-driver
357357- * sharing. It exists since on a given platform it does uniquely identify the
358358- * layout in a simple way for i915-specific userspace.
356356+ * Note that this layout is only accurate on intel gen 8+ or valleyview chipsets.
357357+ * On earlier platforms this layout is highly platform specific and not useful for
358358+ * cross-driver sharing. It exists since on a given platform it does uniquely
359359+ * identify the layout in a simple way for i915-specific userspace, which
360360+ * facilitated conversion of userspace to modifiers. Additionally the exact
361361+ * format on some really old platforms is not known.
359362 */
360363#define I915_FORMAT_MOD_X_TILED fourcc_mod_code(INTEL, 1)
361364···368371 * memory can apply platform-depending swizzling of some higher address bits
369372 * into bit6.
370373 *
371371- * This format is highly platforms specific and not useful for cross-driver
372372- * sharing. It exists since on a given platform it does uniquely identify the
373373- * layout in a simple way for i915-specific userspace.
374374+ * Note that this layout is only accurate on intel gen 8+ or valleyview chipsets.
375375+ * On earlier platforms this layout is highly platform specific and not useful for
376376+ * cross-driver sharing. It exists since on a given platform it does uniquely
377377+ * identify the layout in a simple way for i915-specific userspace, which
378378+ * facilitated conversion of userspace to modifiers. Additionally the exact
379379+ * format on some really old platforms is not known.
374380 */
375381#define I915_FORMAT_MOD_Y_TILED fourcc_mod_code(INTEL, 2)
376382···520526#define DRM_FORMAT_MOD_NVIDIA_TEGRA_TILED fourcc_mod_code(NVIDIA, 1)
521527522528/*
523523- * 16Bx2 Block Linear layout, used by desktop GPUs, and Tegra K1 and later
529529+ * Generalized Block Linear layout, used by desktop GPUs starting with NV50/G80,
530530+ * and Tegra GPUs starting with Tegra K1.
531531+ *
532532+ * Pixels are arranged in Groups of Bytes (GOBs). GOB size and layout varies
533533+ * based on the architecture generation. GOBs themselves are then arranged in
534534+ * 3D blocks, with the block dimensions (in terms of GOBs) always being a power
535535+ * of two, and hence expressible as their log2 equivalent (E.g., "2" represents
536536+ * a block depth or height of "4").
537537+ *
538538+ * Chapter 20 "Pixel Memory Formats" of the Tegra X1 TRM describes this format
539539+ * in full detail.
540540+ *
541541+ * Macro
542542+ * Bits Param Description
543543+ * ---- ----- -----------------------------------------------------------------
544544+ *
545545+ * 3:0 h log2(height) of each block, in GOBs. Placed here for
546546+ * compatibility with the existing
547547+ * DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK()-based modifiers.
548548+ *
549549+ * 4:4 - Must be 1, to indicate block-linear layout. Necessary for
550550+ * compatibility with the existing
551551+ * DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK()-based modifiers.
552552+ *
553553+ * 8:5 - Reserved (To support 3D-surfaces with variable log2(depth) block
554554+ * size). Must be zero.
555555+ *
556556+ * Note there is no log2(width) parameter. Some portions of the
557557+ * hardware support a block width of two gobs, but it is impractical
558558+ * to use due to lack of support elsewhere, and has no known
559559+ * benefits.
560560+ *
561561+ * 11:9 - Reserved (To support 2D-array textures with variable array stride
562562+ * in blocks, specified via log2(tile width in blocks)). Must be
563563+ * zero.
564564+ *
565565+ * 19:12 k Page Kind. This value directly maps to a field in the page
566566+ * tables of all GPUs >= NV50. It affects the exact layout of bits
567567+ * in memory and can be derived from the tuple
568568+ *
569569+ * (format, GPU model, compression type, samples per pixel)
570570+ *
571571+ * Where compression type is defined below. If GPU model were
572572+ * implied by the format modifier, format, or memory buffer, page
573573+ * kind would not need to be included in the modifier itself, but
574574+ * since the modifier should define the layout of the associated
575575+ * memory buffer independent from any device or other context, it
576576+ * must be included here.
577577+ *
578578+ * 21:20 g GOB Height and Page Kind Generation. The height of a GOB changed
579579+ * starting with Fermi GPUs. Additionally, the mapping between page
580580+ * kind and bit layout has changed at various points.
581581+ *
582582+ * 0 = Gob Height 8, Fermi - Volta, Tegra K1+ Page Kind mapping
583583+ * 1 = Gob Height 4, G80 - GT2XX Page Kind mapping
584584+ * 2 = Gob Height 8, Turing+ Page Kind mapping
585585+ * 3 = Reserved for future use.
586586+ *
587587+ * 22:22 s Sector layout. On Tegra GPUs prior to Xavier, there is a further
588588+ * bit remapping step that occurs at an even lower level than the
589589+ * page kind and block linear swizzles. This causes the layout of
590590+ * surfaces mapped in those SOC's GPUs to be incompatible with the
591591+ * equivalent mapping on other GPUs in the same system.
592592+ *
593593+ * 0 = Tegra K1 - Tegra Parker/TX2 Layout.
594594+ * 1 = Desktop GPU and Tegra Xavier+ Layout
595595+ *
596596+ * 25:23 c Lossless Framebuffer Compression type.
597597+ *
598598+ * 0 = none
599599+ * 1 = ROP/3D, layout 1, exact compression format implied by Page
600600+ * Kind field
601601+ * 2 = ROP/3D, layout 2, exact compression format implied by Page
602602+ * Kind field
603603+ * 3 = CDE horizontal
604604+ * 4 = CDE vertical
605605+ * 5 = Reserved for future use
606606+ * 6 = Reserved for future use
607607+ * 7 = Reserved for future use
608608+ *
609609+ * 55:26 - Reserved for future use. Must be zero.
610610+ */
611611+#define DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(c, s, g, k, h) \
612612+ fourcc_mod_code(NVIDIA, (0x10 | \
613613+ ((h) & 0xf) | \
614614+ (((k) & 0xff) << 12) | \
615615+ (((g) & 0x3) << 20) | \
616616+ (((s) & 0x1) << 22) | \
617617+ (((c) & 0x7) << 23)))
618618+619619+/* To grandfather in prior block linear format modifiers to the above layout,
620620+ * the page kind "0", which corresponds to "pitch/linear" and hence is unusable
621621+ * with block-linear layouts, is remapped within drivers to the value 0xfe,
622622+ * which corresponds to the "generic" kind used for simple single-sample
623623+ * uncompressed color formats on Fermi - Volta GPUs.
624624+ */
625625+static inline uint64_t
626626+drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier)
627627+{
628628+ if (!(modifier & 0x10) || (modifier & (0xff << 12)))
629629+ return modifier;
630630+ else
631631+ return modifier | (0xfe << 12);
632632+}
633633+634634+/*
635635+ * 16Bx2 Block Linear layout, used by Tegra K1 and later
524636 *
525637 * Pixels are arranged in 64x8 Groups Of Bytes (GOBs). GOBs are then stacked
526638 * vertically by a power of 2 (1 to 32 GOBs) to form a block.
···541653 * in full detail.
542654 */
543655#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(v) \
544544- fourcc_mod_code(NVIDIA, 0x10 | ((v) & 0xf))
656656+ DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 0, 0, 0, (v))
545657546658#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_ONE_GOB \
547547- fourcc_mod_code(NVIDIA, 0x10)
659659+ DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(0)
548660#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_TWO_GOB \
549549- fourcc_mod_code(NVIDIA, 0x11)
661661+ DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(1)
550662#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_FOUR_GOB \
551551- fourcc_mod_code(NVIDIA, 0x12)
663663+ DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(2)
552664#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_EIGHT_GOB \
553553- fourcc_mod_code(NVIDIA, 0x13)
665665+ DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(3)
554666#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_SIXTEEN_GOB \
555555- fourcc_mod_code(NVIDIA, 0x14)
667667+ DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(4)
556668#define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_THIRTYTWO_GOB \
557557- fourcc_mod_code(NVIDIA, 0x15)
669669+ DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(5)
558670559671/*
560672 * Some Broadcom modifiers take parameters, for example the number of
···11+/* SPDX-License-Identifier: BSD-3-Clause */
22+/*
33+ * Virtio Mem Device
44+ *
55+ * Copyright Red Hat, Inc. 2020
66+ *
77+ * Authors:
88+ * David Hildenbrand <david@redhat.com>
99+ *
1010+ * This header is BSD licensed so anyone can use the definitions
1111+ * to implement compatible drivers/servers:
1212+ *
1313+ * Redistribution and use in source and binary forms, with or without
1414+ * modification, are permitted provided that the following conditions
1515+ * are met:
1616+ * 1. Redistributions of source code must retain the above copyright
1717+ * notice, this list of conditions and the following disclaimer.
1818+ * 2. Redistributions in binary form must reproduce the above copyright
1919+ * notice, this list of conditions and the following disclaimer in the
2020+ * documentation and/or other materials provided with the distribution.
2121+ * 3. Neither the name of IBM nor the names of its contributors
2222+ * may be used to endorse or promote products derived from this software
2323+ * without specific prior written permission.
2424+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
2525+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
2626+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
2727+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR
2828+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
2929+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
3030+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
3131+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
3232+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
3333+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
3434+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3535+ * SUCH DAMAGE.
3636+ */
3737+3838+#ifndef _LINUX_VIRTIO_MEM_H
3939+#define _LINUX_VIRTIO_MEM_H
4040+4141+#include "standard-headers/linux/types.h"
4242+#include "standard-headers/linux/virtio_types.h"
4343+#include "standard-headers/linux/virtio_ids.h"
4444+#include "standard-headers/linux/virtio_config.h"
4545+4646+/*
4747+ * Each virtio-mem device manages a dedicated region in physical address
4848+ * space. Each device can belong to a single NUMA node, multiple devices
4949+ * for a single NUMA node are possible. A virtio-mem device is like a
5050+ * "resizable DIMM" consisting of small memory blocks that can be plugged
5151+ * or unplugged. The device driver is responsible for (un)plugging memory
5252+ * blocks on demand.
5353+ *
5454+ * Virtio-mem devices can only operate on their assigned memory region in
5555+ * order to (un)plug memory. A device cannot (un)plug memory belonging to
5656+ * other devices.
5757+ *
5858+ * The "region_size" corresponds to the maximum amount of memory that can
5959+ * be provided by a device. The "size" corresponds to the amount of memory
6060+ * that is currently plugged. "requested_size" corresponds to a request
6161+ * from the device to the device driver to (un)plug blocks. The
6262+ * device driver should try to (un)plug blocks in order to reach the
6363+ * "requested_size". It is impossible to plug more memory than requested.
6464+ *
6565+ * The "usable_region_size" represents the memory region that can actually
6666+ * be used to (un)plug memory. It is always at least as big as the
6767+ * "requested_size" and will grow dynamically. It will only shrink when
6868+ * explicitly triggered (VIRTIO_MEM_REQ_UNPLUG).
6969+ *
7070+ * There are no guarantees what will happen if unplugged memory is
7171+ * read/written. Such memory should, in general, not be touched. E.g.,
7272+ * even writing might succeed, but the values will simply be discarded at
7373+ * random points in time.
7474+ *
7575+ * It can happen that the device cannot process a request, because it is
7676+ * busy. The device driver has to retry later.
7777+ *
7878+ * Usually, during system resets all memory will get unplugged, so the
7979+ * device driver can start with a clean state. However, in specific
8080+ * scenarios (if the device is busy) it can happen that the device still
8181+ * has memory plugged. The device driver can request to unplug all memory
8282+ * (VIRTIO_MEM_REQ_UNPLUG) - which might take a while to succeed if the
8383+ * device is busy.
8484+ */
8585+8686+/* --- virtio-mem: feature bits --- */
8787+8888+/* node_id is an ACPI PXM and is valid */
8989+#define VIRTIO_MEM_F_ACPI_PXM 0
9090+9191+9292+/* --- virtio-mem: guest -> host requests --- */
9393+9494+/* request to plug memory blocks */
9595+#define VIRTIO_MEM_REQ_PLUG 0
9696+/* request to unplug memory blocks */
9797+#define VIRTIO_MEM_REQ_UNPLUG 1
9898+/* request to unplug all blocks and shrink the usable size */
9999+#define VIRTIO_MEM_REQ_UNPLUG_ALL 2
100100+/* request information about the plugged state of memory blocks */
101101+#define VIRTIO_MEM_REQ_STATE 3
102102+103103+struct virtio_mem_req_plug {
104104+ __virtio64 addr;
105105+ __virtio16 nb_blocks;
106106+ __virtio16 padding[3];
107107+};
108108+109109+struct virtio_mem_req_unplug {
110110+ __virtio64 addr;
111111+ __virtio16 nb_blocks;
112112+ __virtio16 padding[3];
113113+};
114114+115115+struct virtio_mem_req_state {
116116+ __virtio64 addr;
117117+ __virtio16 nb_blocks;
118118+ __virtio16 padding[3];
119119+};
120120+121121+struct virtio_mem_req {
122122+ __virtio16 type;
123123+ __virtio16 padding[3];
124124+125125+ union {
126126+ struct virtio_mem_req_plug plug;
127127+ struct virtio_mem_req_unplug unplug;
128128+ struct virtio_mem_req_state state;
129129+ } u;
130130+};
131131+132132+133133+/* --- virtio-mem: host -> guest response --- */
134134+135135+/*
136136+ * Request processed successfully, applicable for
137137+ * - VIRTIO_MEM_REQ_PLUG
138138+ * - VIRTIO_MEM_REQ_UNPLUG
139139+ * - VIRTIO_MEM_REQ_UNPLUG_ALL
140140+ * - VIRTIO_MEM_REQ_STATE
141141+ */
142142+#define VIRTIO_MEM_RESP_ACK 0
143143+/*
144144+ * Request denied - e.g. trying to plug more than requested, applicable for
145145+ * - VIRTIO_MEM_REQ_PLUG
146146+ */
147147+#define VIRTIO_MEM_RESP_NACK 1
148148+/*
149149+ * Request cannot be processed right now, try again later, applicable for
150150+ * - VIRTIO_MEM_REQ_PLUG
151151+ * - VIRTIO_MEM_REQ_UNPLUG
152152+ * - VIRTIO_MEM_REQ_UNPLUG_ALL
153153+ */
154154+#define VIRTIO_MEM_RESP_BUSY 2
155155+/*
156156+ * Error in request (e.g. addresses/alignment), applicable for
157157+ * - VIRTIO_MEM_REQ_PLUG
158158+ * - VIRTIO_MEM_REQ_UNPLUG
159159+ * - VIRTIO_MEM_REQ_STATE
160160+ */
161161+#define VIRTIO_MEM_RESP_ERROR 3
162162+163163+164164+/* State of memory blocks is "plugged" */
165165+#define VIRTIO_MEM_STATE_PLUGGED 0
166166+/* State of memory blocks is "unplugged" */
167167+#define VIRTIO_MEM_STATE_UNPLUGGED 1
168168+/* State of memory blocks is "mixed" */
169169+#define VIRTIO_MEM_STATE_MIXED 2
170170+171171+struct virtio_mem_resp_state {
172172+ __virtio16 state;
173173+};
174174+175175+struct virtio_mem_resp {
176176+ __virtio16 type;
177177+ __virtio16 padding[3];
178178+179179+ union {
180180+ struct virtio_mem_resp_state state;
181181+ } u;
182182+};
183183+184184+/* --- virtio-mem: configuration --- */
185185+186186+struct virtio_mem_config {
187187+ /* Block size and alignment. Cannot change. */
188188+ uint64_t block_size;
189189+ /* Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. */
190190+ uint16_t node_id;
191191+ uint8_t padding[6];
192192+ /* Start address of the memory region. Cannot change. */
193193+ uint64_t addr;
194194+ /* Region size (maximum). Cannot change. */
195195+ uint64_t region_size;
196196+ /*
197197+ * Currently usable region size. Can grow up to region_size. Can
198198+ * shrink due to VIRTIO_MEM_REQ_UNPLUG_ALL (in which case no config
199199+ * update will be sent).
200200+ */
201201+ uint64_t usable_region_size;
202202+ /*
203203+ * Currently used size. Changes due to plug/unplug requests, but no
204204+ * config updates will be sent.
205205+ */
206206+ uint64_t plugged_size;
207207+ /* Requested size. New plug requests cannot exceed it. Can change. */
208208+ uint64_t requested_size;
209209+};
210210+211211+#endif /* _LINUX_VIRTIO_MEM_H */
+38-10
include/standard-headers/linux/virtio_ring.h
···8484 * at the end of the used ring. Guest should ignore the used->flags field. */
8585#define VIRTIO_RING_F_EVENT_IDX 29
86868787+/* Alignment requirements for vring elements.
8888+ * When using pre-virtio 1.0 layout, these fall out naturally.
8989+ */
9090+#define VRING_AVAIL_ALIGN_SIZE 2
9191+#define VRING_USED_ALIGN_SIZE 4
9292+#define VRING_DESC_ALIGN_SIZE 16
9393+8794/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
8895struct vring_desc {
8996 /* Address (guest-physical). */
···110117 __virtio32 len;
111118};
112119120120+typedef struct vring_used_elem __attribute__((aligned(VRING_USED_ALIGN_SIZE)))
121121+ vring_used_elem_t;
122122+113123struct vring_used {
114124 __virtio16 flags;
115125 __virtio16 idx;
116116- struct vring_used_elem ring[];
126126+ vring_used_elem_t ring[];
117127};
118128129129+/*
130130+ * The ring element addresses are passed between components with different
131131+ * alignments assumptions. Thus, we might need to decrease the compiler-selected
132132+ * alignment, and so must use a typedef to make sure the aligned attribute
133133+ * actually takes hold:
134134+ *
135135+ * https://gcc.gnu.org/onlinedocs//gcc/Common-Type-Attributes.html#Common-Type-Attributes
136136+ *
137137+ * When used on a struct, or struct member, the aligned attribute can only
138138+ * increase the alignment; in order to decrease it, the packed attribute must
139139+ * be specified as well. When used as part of a typedef, the aligned attribute
140140+ * can both increase and decrease alignment, and specifying the packed
141141+ * attribute generates a warning.
142142+ */
143143+typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE)))
144144+ vring_desc_t;
145145+typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE)))
146146+ vring_avail_t;
147147+typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE)))
148148+ vring_used_t;
149149+119150struct vring {
120151 unsigned int num;
121152122122- struct vring_desc *desc;
153153+ vring_desc_t *desc;
123154124124- struct vring_avail *avail;
155155+ vring_avail_t *avail;
125156126126- struct vring_used *used;
157157+ vring_used_t *used;
127158};
128159129129-/* Alignment requirements for vring elements.
130130- * When using pre-virtio 1.0 layout, these fall out naturally.
131131- */
132132-#define VRING_AVAIL_ALIGN_SIZE 2
133133-#define VRING_USED_ALIGN_SIZE 4
134134-#define VRING_DESC_ALIGN_SIZE 16
160160+#ifndef VIRTIO_RING_NO_LEGACY
135161136162/* The standard layout for the ring is a continuous chunk of memory which looks
137163 * like this. We assume num is a power of 2.
···178204 + align - 1) & ~(align - 1))
179205 + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num;
180206}
207207+208208+#endif /* VIRTIO_RING_NO_LEGACY */
181209182210/* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */
183211/* Assuming a given event_idx value from the other side, if
···22#ifndef _ASM_X86_UNISTD_H
33#define _ASM_X86_UNISTD_H
4455-/* x32 syscall flag bit */
66-#define __X32_SYSCALL_BIT 0x40000000UL
55+/*
66+ * x32 syscall flag bit. Some user programs expect syscall NR macros
77+ * and __X32_SYSCALL_BIT to have type int, even though syscall numbers
88+ * are, for practical purposes, unsigned long.
99+ *
1010+ * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right
1111+ * thing regardless.
1212+ */
1313+#define __X32_SYSCALL_BIT 0x40000000
714815# ifdef __i386__
916# include <asm/unistd_32.h>
···305305#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
306306#define VFIO_REGION_TYPE_GFX (1)
307307#define VFIO_REGION_TYPE_CCW (2)
308308+#define VFIO_REGION_TYPE_MIGRATION (3)
308309309310/* sub-types for VFIO_REGION_TYPE_PCI_* */
310311···378379379380/* sub-types for VFIO_REGION_TYPE_CCW */
380381#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1)
382382+#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2)
383383+#define VFIO_REGION_SUBTYPE_CCW_CRW (3)
384384+385385+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
386386+#define VFIO_REGION_SUBTYPE_MIGRATION (1)
387387+388388+/*
389389+ * The structure vfio_device_migration_info is placed at the 0th offset of
390390+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
391391+ * migration information. Field accesses from this structure are only supported
392392+ * at their native width and alignment. Otherwise, the result is undefined and
393393+ * vendor drivers should return an error.
394394+ *
395395+ * device_state: (read/write)
396396+ * - The user application writes to this field to inform the vendor driver
397397+ * about the device state to be transitioned to.
398398+ * - The vendor driver should take the necessary actions to change the
399399+ * device state. After successful transition to a given state, the
400400+ * vendor driver should return success on write(device_state, state)
401401+ * system call. If the device state transition fails, the vendor driver
402402+ * should return an appropriate -errno for the fault condition.
403403+ * - On the user application side, if the device state transition fails,
404404+ * that is, if write(device_state, state) returns an error, read
405405+ * device_state again to determine the current state of the device from
406406+ * the vendor driver.
407407+ * - The vendor driver should return previous state of the device unless
408408+ * the vendor driver has encountered an internal error, in which case
409409+ * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
410410+ * - The user application must use the device reset ioctl to recover the
411411+ * device from VFIO_DEVICE_STATE_ERROR state. If the device is
412412+ * indicated to be in a valid device state by reading device_state, the
413413+ * user application may attempt to transition the device to any valid
414414+ * state reachable from the current state or terminate itself.
415415+ *
416416+ * device_state consists of 3 bits:
417417+ * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
418418+ * it indicates the _STOP state. When the device state is changed to
419419+ * _STOP, driver should stop the device before write() returns.
420420+ * - If bit 1 is set, it indicates the _SAVING state, which means that the
421421+ * driver should start gathering device state information that will be
422422+ * provided to the VFIO user application to save the device's state.
423423+ * - If bit 2 is set, it indicates the _RESUMING state, which means that
424424+ * the driver should prepare to resume the device. Data provided through
425425+ * the migration region should be used to resume the device.
426426+ * Bits 3 - 31 are reserved for future use. To preserve them, the user
427427+ * application should perform a read-modify-write operation on this
428428+ * field when modifying the specified bits.
429429+ *
430430+ * +------- _RESUMING
431431+ * |+------ _SAVING
432432+ * ||+----- _RUNNING
433433+ * |||
434434+ * 000b => Device Stopped, not saving or resuming
435435+ * 001b => Device running, which is the default state
436436+ * 010b => Stop the device & save the device state, stop-and-copy state
437437+ * 011b => Device running and save the device state, pre-copy state
438438+ * 100b => Device stopped and the device state is resuming
439439+ * 101b => Invalid state
440440+ * 110b => Error state
441441+ * 111b => Invalid state
442442+ *
443443+ * State transitions:
444444+ *
445445+ * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP
446446+ * (100b) (001b) (011b) (010b) (000b)
447447+ * 0. Running or default state
448448+ * |
449449+ *
450450+ * 1. Normal Shutdown (optional)
451451+ * |------------------------------------->|
452452+ *
453453+ * 2. Save the state or suspend
454454+ * |------------------------->|---------->|
455455+ *
456456+ * 3. Save the state during live migration
457457+ * |----------->|------------>|---------->|
458458+ *
459459+ * 4. Resuming
460460+ * |<---------|
461461+ *
462462+ * 5. Resumed
463463+ * |--------->|
464464+ *
465465+ * 0. Default state of VFIO device is _RUNNING when the user application starts.
466466+ * 1. During normal shutdown of the user application, the user application may
467467+ * optionally change the VFIO device state from _RUNNING to _STOP. This
468468+ * transition is optional. The vendor driver must support this transition but
469469+ * must not require it.
470470+ * 2. When the user application saves state or suspends the application, the
471471+ * device state transitions from _RUNNING to stop-and-copy and then to _STOP.
472472+ * On state transition from _RUNNING to stop-and-copy, driver must stop the
473473+ * device, save the device state and send it to the application through the
474474+ * migration region. The sequence to be followed for such transition is given
475475+ * below.
476476+ * 3. In live migration of user application, the state transitions from _RUNNING
477477+ * to pre-copy, to stop-and-copy, and to _STOP.
478478+ * On state transition from _RUNNING to pre-copy, the driver should start
479479+ * gathering the device state while the application is still running and send
480480+ * the device state data to application through the migration region.
481481+ * On state transition from pre-copy to stop-and-copy, the driver must stop
482482+ * the device, save the device state and send it to the user application
483483+ * through the migration region.
484484+ * Vendor drivers must support the pre-copy state even for implementations
485485+ * where no data is provided to the user before the stop-and-copy state. The
486486+ * user must not be required to consume all migration data before the device
487487+ * transitions to a new state, including the stop-and-copy state.
488488+ * The sequence to be followed for above two transitions is given below.
489489+ * 4. To start the resuming phase, the device state should be transitioned from
490490+ * the _RUNNING to the _RESUMING state.
491491+ * In the _RESUMING state, the driver should use the device state data
492492+ * received through the migration region to resume the device.
493493+ * 5. After providing saved device data to the driver, the application should
494494+ * change the state from _RESUMING to _RUNNING.
495495+ *
496496+ * reserved:
497497+ * Reads on this field return zero and writes are ignored.
498498+ *
499499+ * pending_bytes: (read only)
500500+ * The number of pending bytes still to be migrated from the vendor driver.
501501+ *
502502+ * data_offset: (read only)
503503+ * The user application should read data_offset field from the migration
504504+ * region. The user application should read the device data from this
505505+ * offset within the migration region during the _SAVING state or write
506506+ * the device data during the _RESUMING state. See below for details of
507507+ * sequence to be followed.
508508+ *
509509+ * data_size: (read/write)
510510+ * The user application should read data_size to get the size in bytes of
511511+ * the data copied in the migration region during the _SAVING state and
512512+ * write the size in bytes of the data copied in the migration region
513513+ * during the _RESUMING state.
514514+ *
515515+ * The format of the migration region is as follows:
516516+ * ------------------------------------------------------------------
517517+ * |vfio_device_migration_info| data section |
518518+ * | | /////////////////////////////// |
519519+ * ------------------------------------------------------------------
520520+ * ^ ^
521521+ * offset 0-trapped part data_offset
522522+ *
523523+ * The structure vfio_device_migration_info is always followed by the data
524524+ * section in the region, so data_offset will always be nonzero. The offset
525525+ * from where the data is copied is decided by the kernel driver. The data
526526+ * section can be trapped, mmapped, or partitioned, depending on how the kernel
527527+ * driver defines the data section. The data section partition can be defined
528528+ * as mapped by the sparse mmap capability. If mmapped, data_offset must be
529529+ * page aligned, whereas initial section which contains the
530530+ * vfio_device_migration_info structure, might not end at the offset, which is
531531+ * page aligned. The user is not required to access through mmap regardless
532532+ * of the capabilities of the region mmap.
533533+ * The vendor driver should determine whether and how to partition the data
534534+ * section. The vendor driver should return data_offset accordingly.
535535+ *
536536+ * The sequence to be followed while in pre-copy state and stop-and-copy state
537537+ * is as follows:
538538+ * a. Read pending_bytes, indicating the start of a new iteration to get device
539539+ * data. Repeated read on pending_bytes at this stage should have no side
540540+ * effects.
541541+ * If pending_bytes == 0, the user application should not iterate to get data
542542+ * for that device.
543543+ * If pending_bytes > 0, perform the following steps.
544544+ * b. Read data_offset, indicating that the vendor driver should make data
545545+ * available through the data section. The vendor driver should return this
546546+ * read operation only after data is available from (region + data_offset)
547547+ * to (region + data_offset + data_size).
548548+ * c. Read data_size, which is the amount of data in bytes available through
549549+ * the migration region.
550550+ * Read on data_offset and data_size should return the offset and size of
551551+ * the current buffer if the user application reads data_offset and
552552+ * data_size more than once here.
553553+ * d. Read data_size bytes of data from (region + data_offset) from the
554554+ * migration region.
555555+ * e. Process the data.
556556+ * f. Read pending_bytes, which indicates that the data from the previous
557557+ * iteration has been read. If pending_bytes > 0, go to step b.
558558+ *
559559+ * The user application can transition from the _SAVING|_RUNNING
560560+ * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
561561+ * number of pending bytes. The user application should iterate in _SAVING
562562+ * (stop-and-copy) until pending_bytes is 0.
563563+ *
564564+ * The sequence to be followed while _RESUMING device state is as follows:
565565+ * While data for this device is available, repeat the following steps:
566566+ * a. Read data_offset from where the user application should write data.
567567+ * b. Write migration data starting at the migration region + data_offset for
568568+ * the length determined by data_size from the migration source.
569569+ * c. Write data_size, which indicates to the vendor driver that data is
570570+ * written in the migration region. Vendor driver must return this write
571571+ * operation on consuming data. Vendor driver should apply the
572572+ * user-provided migration region data to the device resume state.
573573+ *
574574+ * If an error occurs during the above sequences, the vendor driver can return
575575+ * an error code for next read() or write() operation, which will terminate the
576576+ * loop. The user application should then take the next necessary action, for
577577+ * example, failing migration or terminating the user application.
578578+ *
579579+ * For the user application, data is opaque. The user application should write
580580+ * data in the same order as the data is received and the data should be of
581581+ * same transaction size at the source.
582582+ */
583583+584584+struct vfio_device_migration_info {
585585+ __u32 device_state; /* VFIO device state */
586586+#define VFIO_DEVICE_STATE_STOP (0)
587587+#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
588588+#define VFIO_DEVICE_STATE_SAVING (1 << 1)
589589+#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
590590+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
591591+ VFIO_DEVICE_STATE_SAVING | \
592592+ VFIO_DEVICE_STATE_RESUMING)
593593+594594+#define VFIO_DEVICE_STATE_VALID(state) \
595595+ (state & VFIO_DEVICE_STATE_RESUMING ? \
596596+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
597597+598598+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
599599+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
600600+ VFIO_DEVICE_STATE_RESUMING))
601601+602602+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
603603+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
604604+ VFIO_DEVICE_STATE_RESUMING)
605605+606606+ __u32 reserved;
607607+ __u64 pending_bytes;
608608+ __u64 data_offset;
609609+ __u64 data_size;
610610+};
381611382612/*
383613 * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
···577807578808enum {
579809 VFIO_CCW_IO_IRQ_INDEX,
810810+ VFIO_CCW_CRW_IRQ_INDEX,
580811 VFIO_CCW_NUM_IRQS
581812};
582813···7851016 struct vfio_iova_range iova_ranges[];
7861017};
787101810191019+/*
10201020+ * The migration capability allows to report supported features for migration.
10211021+ *
10221022+ * The structures below define version 1 of this capability.
10231023+ *
10241024+ * The existence of this capability indicates that IOMMU kernel driver supports
10251025+ * dirty page logging.
10261026+ *
10271027+ * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty
10281028+ * page logging.
10291029+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
10301030+ * size in bytes that can be used by user applications when getting the dirty
10311031+ * bitmap.
10321032+ */
10331033+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 1
10341034+10351035+struct vfio_iommu_type1_info_cap_migration {
10361036+ struct vfio_info_cap_header header;
10371037+ __u32 flags;
10381038+ __u64 pgsize_bitmap;
10391039+ __u64 max_dirty_bitmap_size; /* in bytes */
10401040+};
10411041+7881042#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
78910437901044/**
···80510598061060#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
807106110621062+struct vfio_bitmap {
10631063+ __u64 pgsize; /* page size for bitmap in bytes */
10641064+ __u64 size; /* in bytes */
10651065+ __u64 *data; /* one bit per page */
10661066+};
10671067+8081068/**
8091069 * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
8101070 * struct vfio_dma_unmap)
···8141074 * field. No guarantee is made to the user that arbitrary unmaps of iova
8151075 * or size different from those used in the original mapping call will
8161076 * succeed.
10771077+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
10781078+ * before unmapping IO virtual addresses. When this flag is set, the user must
10791079+ * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
10801080+ * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field.
10811081+ * A bit in the bitmap represents one page, of user provided page size in
10821082+ * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set
10831083+ * indicates that the page at that offset from iova is dirty. A bitmap of the
10841084+ * pages in the range of unmapped size is returned in the user-provided
10851085+ * vfio_bitmap.data.
8171086 */
8181087struct vfio_iommu_type1_dma_unmap {
8191088 __u32 argsz;
8201089 __u32 flags;
10901090+#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
8211091 __u64 iova; /* IO virtual address */
8221092 __u64 size; /* Size of mapping (bytes) */
10931093+ __u8 data[];
8231094};
82410958251096#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
···8301101 */
8311102#define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15)
8321103#define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
11041104+11051105+/**
11061106+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
11071107+ * struct vfio_iommu_type1_dirty_bitmap)
11081108+ * IOCTL is used for dirty pages logging.
11091109+ * Caller should set flag depending on which operation to perform, details as
11101110+ * below:
11111111+ *
11121112+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
11131113+ * the IOMMU driver to log pages that are dirtied or potentially dirtied by
11141114+ * the device; designed to be used when a migration is in progress. Dirty pages
11151115+ * are logged until logging is disabled by user application by calling the IOCTL
11161116+ * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
11171117+ *
11181118+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
11191119+ * the IOMMU driver to stop logging dirtied pages.
11201120+ *
11211121+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
11221122+ * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
11231123+ * The user must specify the IOVA range and the pgsize through the structure
11241124+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
11251125+ * supports getting a bitmap of the smallest supported pgsize only and can be
11261126+ * modified in future to get a bitmap of any specified supported pgsize. The
11271127+ * user must provide a zeroed memory area for the bitmap memory and specify its
11281128+ * size in bitmap.size. One bit is used to represent one page consecutively
11291129+ * starting from iova offset. The user should provide page size in bitmap.pgsize
11301130+ * field. A bit set in the bitmap indicates that the page at that offset from
11311131+ * iova is dirty. The caller must set argsz to a value including the size of
11321132+ * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the
11331133+ * actual bitmap. If dirty pages logging is not enabled, an error will be
11341134+ * returned.
11351135+ *
11361136+ * Only one of the flags _START, _STOP and _GET may be specified at a time.
11371137+ *
11381138+ */
11391139+struct vfio_iommu_type1_dirty_bitmap {
11401140+ __u32 argsz;
11411141+ __u32 flags;
11421142+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
11431143+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
11441144+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
11451145+ __u8 data[];
11461146+};
11471147+11481148+struct vfio_iommu_type1_dirty_bitmap_get {
11491149+ __u64 iova; /* IO virtual address */
11501150+ __u64 size; /* Size of iova range */
11511151+ struct vfio_bitmap bitmap;
11521152+};
11531153+11541154+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
83311558341156/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
8351157
+19
linux-headers/linux/vfio_ccw.h
···3434 __u32 ret_code;
3535} __attribute__((packed));
36363737+/*
3838+ * Used for processing commands that read the subchannel-information block
3939+ * Reading this region triggers a stsch() to hardware
4040+ * Note: this is controlled by a capability
4141+ */
4242+struct ccw_schib_region {
4343+#define SCHIB_AREA_SIZE 52
4444+ __u8 schib_area[SCHIB_AREA_SIZE];
4545+} __attribute__((packed));
4646+4747+/*
4848+ * Used for returning a Channel Report Word to userspace.
4949+ * Note: this is controlled by a capability
5050+ */
5151+struct ccw_crw_region {
5252+ __u32 crw;
5353+ __u32 pad;
5454+} __attribute__((packed));
5555+3756#endif
+4
linux-headers/linux/vhost.h
···1515#include <linux/types.h>
1616#include <linux/ioctl.h>
17171818+#define VHOST_FILE_UNBIND -1
1919+1820/* ioctls */
19212022#define VHOST_VIRTIO 0xAF
···140142/* Get the max ring size. */
141143#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16)
142144145145+/* Set event fd for config interrupt*/
146146+#define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int)
143147#endif