
Merge remote-tracking branch 'remotes/marcel/tags/rdma-pull-request' into staging

PVRDMA implementation

# gpg: Signature made Mon 19 Feb 2018 11:08:49 GMT
# gpg: using RSA key 36D4C0F0CF2FE46D
# gpg: Good signature from "Marcel Apfelbaum <marcel@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: B1C6 3A57 F92E 08F2 640F 31F5 36D4 C0F0 CF2F E46D

* remotes/marcel/tags/rdma-pull-request:
MAINTAINERS: add entry for hw/rdma
hw/rdma: Implementation of PVRDMA device
hw/rdma: PVRDMA commands and data-path ops
hw/rdma: Implementation of generic rdma device layers
hw/rdma: Definitions for rdma device and rdma resource manager
hw/rdma: Add wrappers and macros
include/standard-headers: add pvrdma related headers
scripts/update-linux-headers: import pvrdma headers
docs: add pvrdma device documentation.
mem: add share parameter to memory-backend-ram

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

+5570 -54
+8
MAINTAINERS
···
2034 2034   F: tests/test-replication.c
2035 2035   F: docs/block-replication.txt
2036 2036
     2037 + PVRDMA
     2038 + M: Yuval Shaia <yuval.shaia@oracle.com>
     2039 + M: Marcel Apfelbaum <marcel@redhat.com>
     2040 + S: Maintained
     2041 + F: hw/rdma/*
     2042 + F: hw/rdma/vmw/*
     2043 + F: docs/pvrdma.txt
     2044 +
2037 2045   Build and test automation
2038 2046   -------------------------
2039 2047   Build and test automation
+2
Makefile.objs
···
130 130   trace-events-subdirs += hw/char
131 131   trace-events-subdirs += hw/intc
132 132   trace-events-subdirs += hw/net
    133 + trace-events-subdirs += hw/rdma
    134 + trace-events-subdirs += hw/rdma/vmw
133 135   trace-events-subdirs += hw/virtio
134 136   trace-events-subdirs += hw/audio
135 137   trace-events-subdirs += hw/misc
+1 -24
backends/hostmem-file.c
···
 31  31   struct HostMemoryBackendFile {
 32  32       HostMemoryBackend parent_obj;
 33  33
 34      -     bool share;
 35  34       bool discard_data;
 36  35       char *mem_path;
 37  36       uint64_t align;
···
 59  58       path = object_get_canonical_path(OBJECT(backend));
 60  59       memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
 61  60                                        path,
 62      -                                     backend->size, fb->align, fb->share,
     61  +                                     backend->size, fb->align, backend->share,
 63  62                                        fb->mem_path, errp);
 64  63       g_free(path);
 65  64   }
···
 86  85       fb->mem_path = g_strdup(str);
 87  86   }
 88  87
 89      - static bool file_memory_backend_get_share(Object *o, Error **errp)
 90      - {
 91      -     HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
 92      -
 93      -     return fb->share;
 94      - }
 95      -
 96      - static void file_memory_backend_set_share(Object *o, bool value, Error **errp)
 97      - {
 98      -     HostMemoryBackend *backend = MEMORY_BACKEND(o);
 99      -     HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
100      -
101      -     if (host_memory_backend_mr_inited(backend)) {
102      -         error_setg(errp, "cannot change property value");
103      -         return;
104      -     }
105      -     fb->share = value;
106      - }
107      -
108  88   static bool file_memory_backend_get_discard_data(Object *o, Error **errp)
109  89   {
110  90       return MEMORY_BACKEND_FILE(o)->discard_data;
···
171 151       bc->alloc = file_backend_memory_alloc;
172 152       oc->unparent = file_backend_unparent;
173 153
174      -     object_class_property_add_bool(oc, "share",
175      -         file_memory_backend_get_share, file_memory_backend_set_share,
176      -         &error_abort);
177 154       object_class_property_add_bool(oc, "discard-data",
178 155           file_memory_backend_get_discard_data, file_memory_backend_set_discard_data,
179 156           &error_abort);
+2 -2
backends/hostmem-ram.c
···
28 28   }
29 29
30 30       path = object_get_canonical_path_component(OBJECT(backend));
31    -     memory_region_init_ram_nomigrate(&backend->mr, OBJECT(backend), path,
32    -                                      backend->size, errp);
   31 +     memory_region_init_ram_shared_nomigrate(&backend->mr, OBJECT(backend), path,
   32 +                                             backend->size, backend->share, errp);
33 33       g_free(path);
34 34   }
35 35
+21
backends/hostmem.c
···
368 368       backend->id = g_strdup(str);
369 369   }
370 370
    371 + static bool host_memory_backend_get_share(Object *o, Error **errp)
    372 + {
    373 +     HostMemoryBackend *backend = MEMORY_BACKEND(o);
    374 +
    375 +     return backend->share;
    376 + }
    377 +
    378 + static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
    379 + {
    380 +     HostMemoryBackend *backend = MEMORY_BACKEND(o);
    381 +
    382 +     if (host_memory_backend_mr_inited(backend)) {
    383 +         error_setg(errp, "cannot change property value");
    384 +         return;
    385 +     }
    386 +     backend->share = value;
    387 + }
    388 +
371 389   static void
372 390   host_memory_backend_class_init(ObjectClass *oc, void *data)
373 391   {
···
398 416                                      host_memory_backend_get_policy,
399 417                                      host_memory_backend_set_policy, &error_abort);
400 418       object_class_property_add_str(oc, "id", get_id, set_id, &error_abort);
    419 +     object_class_property_add_bool(oc, "share",
    420 +         host_memory_backend_get_share, host_memory_backend_set_share,
    421 +         &error_abort);
401 422   }
402 423
403 424   static void host_memory_backend_finalize(Object *o)
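
With "share" lifted from the file backend into the common HostMemoryBackend class (hunks above), a plain memory-backend-ram object can now request a shared mapping too. A minimal usage sketch, mirroring the invocation given in docs/pvrdma.txt further down (the id and sizes are placeholders, not part of this series):

    qemu-system-x86_64 \
        -m 1G \
        -object memory-backend-ram,id=mb1,size=1G,share=on \
        -numa node,memdev=mb1 \
        ...

Per the series, share=on makes the anonymous guest RAM get mapped shared instead of private, which is what allows the pvrdma device to remap and register those pages with the host HCA.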
+5 -4
configure
···
1572 1572   hax             HAX acceleration support
1573 1573   hvf             Hypervisor.framework acceleration support
1574 1574   whpx            Windows Hypervisor Platform acceleration support
1575      -  rdma            RDMA-based migration support
     1575 +  rdma            Enable RDMA-based migration and PVRDMA support
1576 1576   vde             support for vde network
1577 1577   netmap          support for netmap network
1578 1578   linux-aio       Linux AIO support
···
2923 2923   #include <rdma/rdma_cma.h>
2924 2924   int main(void) { return 0; }
2925 2925   EOF
2926      -   rdma_libs="-lrdmacm -libverbs"
     2926 +   rdma_libs="-lrdmacm -libverbs -libumad"
2927 2927     if compile_prog "" "$rdma_libs" ; then
2928 2928       rdma="yes"
     2929 +     libs_softmmu="$libs_softmmu $rdma_libs"
2929 2930     else
2930 2931       if test "$rdma" = "yes" ; then
2931 2932           error_exit \
2932      -           " OpenFabrics librdmacm/libibverbs not present." \
     2933 +           " OpenFabrics librdmacm/libibverbs/libibumad not present." \
2933 2934           " Your options:" \
2934      -           " (1) Fast: Install infiniband packages from your distro." \
     2935 +           " (1) Fast: Install infiniband packages (devel) from your distro." \
2935 2936           " (2) Cleanest: Install libraries from www.openfabrics.org" \
2936 2937           " (3) Also: Install softiwarp if you don't have RDMA hardware"
2937 2938       fi
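
Because the probe above now links against libibumad in addition to librdmacm and libibverbs, all three development packages have to be present before --enable-rdma succeeds. A hedged build sketch follows; the package names are typical Fedora/RHEL ones and may differ on other distributions:

    # illustrative only; exact package names vary by distro
    dnf install librdmacm-devel libibverbs-devel libibumad-devel
    ./configure --enable-rdma
    make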
+255
docs/pvrdma.txt
··· 1 + Paravirtualized RDMA Device (PVRDMA) 2 + ==================================== 3 + 4 + 5 + 1. Description 6 + =============== 7 + PVRDMA is the QEMU implementation of VMware's paravirtualized RDMA device. 8 + It works with its Linux Kernel driver AS IS, no need for any special guest 9 + modifications. 10 + 11 + While it complies with the VMware device, it can also communicate with bare 12 + metal RDMA-enabled machines and does not require an RDMA HCA in the host, it 13 + can work with Soft-RoCE (rxe). 14 + 15 + It does not require the whole guest RAM to be pinned allowing memory 16 + over-commit and, even if not implemented yet, migration support will be 17 + possible with some HW assistance. 18 + 19 + A project presentation accompany this document: 20 + - http://events.linuxfoundation.org/sites/events/files/slides/lpc-2017-pvrdma-marcel-apfelbaum-yuval-shaia.pdf 21 + 22 + 23 + 24 + 2. Setup 25 + ======== 26 + 27 + 28 + 2.1 Guest setup 29 + =============== 30 + Fedora 27+ kernels work out of the box, older distributions 31 + require updating the kernel to 4.14 to include the pvrdma driver. 32 + 33 + However the libpvrdma library needed by User Level Software is still 34 + not available as part of the distributions, so the rdma-core library 35 + needs to be compiled and optionally installed. 36 + 37 + Please follow the instructions at: 38 + https://github.com/linux-rdma/rdma-core.git 39 + 40 + 41 + 2.2 Host Setup 42 + ============== 43 + The pvrdma backend is an ibdevice interface that can be exposed 44 + either by a Soft-RoCE(rxe) device on machines with no RDMA device, 45 + or an HCA SRIOV function(VF/PF). 46 + Note that ibdevice interfaces can't be shared between pvrdma devices, 47 + each one requiring a separate instance (rxe or SRIOV VF). 48 + 49 + 50 + 2.2.1 Soft-RoCE backend(rxe) 51 + =========================== 52 + A stable version of rxe is required, Fedora 27+ or a Linux 53 + Kernel 4.14+ is preferred. 54 + 55 + The rdma_rxe module is part of the Linux Kernel but not loaded by default. 56 + Install the User Level library (librxe) following the instructions from: 57 + https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home 58 + 59 + Associate an ETH interface with rxe by running: 60 + rxe_cfg add eth0 61 + An rxe0 ibdevice interface will be created and can be used as pvrdma backend. 62 + 63 + 64 + 2.2.2 RDMA device Virtual Function backend 65 + ========================================== 66 + Nothing special is required, the pvrdma device can work not only with 67 + Ethernet Links, but also Infinibands Links. 68 + All is needed is an ibdevice with an active port, for Mellanox cards 69 + will be something like mlx5_6 which can be the backend. 70 + 71 + 72 + 2.2.3 QEMU setup 73 + ================ 74 + Configure QEMU with --enable-rdma flag, installing 75 + the required RDMA libraries. 76 + 77 + 78 + 79 + 3. Usage 80 + ======== 81 + Currently the device is working only with memory backed RAM 82 + and it must be mark as "shared": 83 + -m 1G \ 84 + -object memory-backend-ram,id=mb1,size=1G,share \ 85 + -numa node,memdev=mb1 \ 86 + 87 + The pvrdma device is composed of two functions: 88 + - Function 0 is a vmxnet Ethernet Device which is redundant in Guest 89 + but is required to pass the ibdevice GID using its MAC. 
90 + Examples: 91 + For an rxe backend using eth0 interface it will use its mac: 92 + -device vmxnet3,addr=<slot>.0,multifunction=on,mac=<eth0 MAC> 93 + For an SRIOV VF, we take the Ethernet Interface exposed by it: 94 + -device vmxnet3,multifunction=on,mac=<RoCE eth MAC> 95 + - Function 1 is the actual device: 96 + -device pvrdma,addr=<slot>.1,backend-dev=<ibdevice>,backend-gid-idx=<gid>,backend-port=<port> 97 + where the ibdevice can be rxe or RDMA VF (e.g. mlx5_4) 98 + Note: Pay special attention that the GID at backend-gid-idx matches vmxnet's MAC. 99 + The rules of conversion are part of the RoCE spec, but since manual conversion 100 + is not required, spotting problems is not hard: 101 + Example: GID: fe80:0000:0000:0000:7efe:90ff:fecb:743a 102 + MAC: 7c:fe:90:cb:74:3a 103 + Note the difference between the first byte of the MAC and the GID. 104 + 105 + 106 + 107 + 4. Implementation details 108 + ========================= 109 + 110 + 111 + 4.1 Overview 112 + ============ 113 + The device acts like a proxy between the Guest Driver and the host 114 + ibdevice interface. 115 + On configuration path: 116 + - For every hardware resource request (PD/QP/CQ/...) the pvrdma will request 117 + a resource from the backend interface, maintaining a 1-1 mapping 118 + between the guest and host. 119 + On data path: 120 + - Every post_send/receive received from the guest will be converted into 121 + a post_send/receive for the backend. The buffers data will not be touched 122 + or copied resulting in near bare-metal performance for large enough buffers. 123 + - Completions from the backend interface will result in completions for 124 + the pvrdma device. 125 + 126 + 127 + 4.2 PCI BARs 128 + ============ 129 + PCI Bars: 130 + BAR 0 - MSI-X 131 + MSI-X vectors: 132 + (0) Command - used when execution of a command is completed. 133 + (1) Async - not in use. 134 + (2) Completion - used when a completion event is placed in 135 + device's CQ ring. 
136 + BAR 1 - Registers 137 + -------------------------------------------------------- 138 + | VERSION | DSR | CTL | REQ | ERR | ICR | IMR | MAC | 139 + -------------------------------------------------------- 140 + DSR - Address of driver/device shared memory used 141 + for the command channel, used for passing: 142 + - General info such as driver version 143 + - Address of 'command' and 'response' 144 + - Address of async ring 145 + - Address of device's CQ ring 146 + - Device capabilities 147 + CTL - Device control operations (activate, reset etc) 148 + IMG - Set interrupt mask 149 + REQ - Command execution register 150 + ERR - Operation status 151 + 152 + BAR 2 - UAR 153 + --------------------------------------------------------- 154 + | QP_NUM | SEND/RECV Flag || CQ_NUM | ARM/POLL Flag | 155 + --------------------------------------------------------- 156 + - Offset 0 used for QP operations (send and recv) 157 + - Offset 4 used for CQ operations (arm and poll) 158 + 159 + 160 + 4.3 Major flows 161 + =============== 162 + 163 + 4.3.1 Create CQ 164 + =============== 165 + - Guest driver 166 + - Allocates pages for CQ ring 167 + - Creates page directory (pdir) to hold CQ ring's pages 168 + - Initializes CQ ring 169 + - Initializes 'Create CQ' command object (cqe, pdir etc) 170 + - Copies the command to 'command' address 171 + - Writes 0 into REQ register 172 + - Device 173 + - Reads the request object from the 'command' address 174 + - Allocates CQ object and initialize CQ ring based on pdir 175 + - Creates the backend CQ 176 + - Writes operation status to ERR register 177 + - Posts command-interrupt to guest 178 + - Guest driver 179 + - Reads the HW response code from ERR register 180 + 181 + 4.3.2 Create QP 182 + =============== 183 + - Guest driver 184 + - Allocates pages for send and receive rings 185 + - Creates page directory(pdir) to hold the ring's pages 186 + - Initializes 'Create QP' command object (max_send_wr, 187 + send_cq_handle, recv_cq_handle, pdir etc) 188 + - Copies the object to 'command' address 189 + - Write 0 into REQ register 190 + - Device 191 + - Reads the request object from 'command' address 192 + - Allocates the QP object and initialize 193 + - Send and recv rings based on pdir 194 + - Send and recv ring state 195 + - Creates the backend QP 196 + - Writes the operation status to ERR register 197 + - Posts command-interrupt to guest 198 + - Guest driver 199 + - Reads the HW response code from ERR register 200 + 201 + 4.3.3 Post receive 202 + ================== 203 + - Guest driver 204 + - Initializes a wqe and place it on recv ring 205 + - Write to qpn|qp_recv_bit (31) to QP offset in UAR 206 + - Device 207 + - Extracts qpn from UAR 208 + - Walks through the ring and does the following for each wqe 209 + - Prepares the backend CQE context to be used when 210 + receiving completion from backend (wr_id, op_code, emu_cq_num) 211 + - For each sge prepares backend sge 212 + - Calls backend's post_recv 213 + 214 + 4.3.4 Process backend events 215 + ============================ 216 + - Done by a dedicated thread used to process backend events; 217 + at initialization is attached to the device and creates 218 + the communication channel. 219 + - Thread main loop: 220 + - Polls for completions 221 + - Extracts QEMU _cq_num, wr_id and op_code from context 222 + - Writes CQE to CQ ring 223 + - Writes CQ number to device CQ 224 + - Sends completion-interrupt to guest 225 + - Deallocates context 226 + - Acks the event to backend 227 + 228 + 229 + 230 + 5. 
Limitations 231 + ============== 232 + - The device obviously is limited by the Guest Linux Driver features implementation 233 + of the VMware device API. 234 + - Memory registration mechanism requires mremap for every page in the buffer in order 235 + to map it to a contiguous virtual address range. Since this is not the data path 236 + it should not matter much. If the default max mr size is increased, be aware that 237 + memory registration can take up to 0.5 seconds for 1GB of memory. 238 + - The device requires target page size to be the same as the host page size, 239 + otherwise it will fail to init. 240 + - QEMU cannot map guest RAM from a file descriptor if a pvrdma device is attached, 241 + so it can't work with huge pages. The limitation will be addressed in the future, 242 + however QEMU allocates Guest RAM with MADV_HUGEPAGE so if there are enough huge 243 + pages available, QEMU will use them. QEMU will fail to init if the requirements 244 + are not met. 245 + 246 + 247 + 248 + 6. Performance 249 + ============== 250 + By design the pvrdma device exits on each post-send/receive, so for small buffers 251 + the performance is affected; however for medium buffers it will became close to 252 + bare metal and from 1MB buffers and up it reaches bare metal performance. 253 + (tested with 2 VMs, the pvrdma devices connected to 2 VFs of the same device) 254 + 255 + All the above assumes no memory registration is done on data path.
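
Putting section 3 of the new document together, a complete pvrdma invocation pairs the shared memory backend with the two PCI functions. This is only an assembled sketch: the slot, MAC, GID index and port are placeholders that must match the actual rxe or SRIOV VF backend on the host:

    qemu-system-x86_64 \
        -m 1G \
        -object memory-backend-ram,id=mb1,size=1G,share=on \
        -numa node,memdev=mb1 \
        -device vmxnet3,addr=<slot>.0,multifunction=on,mac=<backend eth MAC> \
        -device pvrdma,addr=<slot>.1,backend-dev=rxe0,backend-gid-idx=0,backend-port=1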
+15 -11
exec.c
··· 1285 1285 uint16_t section); 1286 1286 static subpage_t *subpage_init(FlatView *fv, hwaddr base); 1287 1287 1288 - static void *(*phys_mem_alloc)(size_t size, uint64_t *align) = 1288 + static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) = 1289 1289 qemu_anon_ram_alloc; 1290 1290 1291 1291 /* ··· 1293 1293 * Accelerators with unusual needs may need this. Hopefully, we can 1294 1294 * get rid of it eventually. 1295 1295 */ 1296 - void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align)) 1296 + void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared)) 1297 1297 { 1298 1298 phys_mem_alloc = alloc; 1299 1299 } ··· 1921 1921 } 1922 1922 } 1923 1923 1924 - static void ram_block_add(RAMBlock *new_block, Error **errp) 1924 + static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared) 1925 1925 { 1926 1926 RAMBlock *block; 1927 1927 RAMBlock *last_block = NULL; ··· 1944 1944 } 1945 1945 } else { 1946 1946 new_block->host = phys_mem_alloc(new_block->max_length, 1947 - &new_block->mr->align); 1947 + &new_block->mr->align, shared); 1948 1948 if (!new_block->host) { 1949 1949 error_setg_errno(errp, errno, 1950 1950 "cannot set up guest memory '%s'", ··· 2049 2049 return NULL; 2050 2050 } 2051 2051 2052 - ram_block_add(new_block, &local_err); 2052 + ram_block_add(new_block, &local_err, share); 2053 2053 if (local_err) { 2054 2054 g_free(new_block); 2055 2055 error_propagate(errp, local_err); ··· 2091 2091 void (*resized)(const char*, 2092 2092 uint64_t length, 2093 2093 void *host), 2094 - void *host, bool resizeable, 2094 + void *host, bool resizeable, bool share, 2095 2095 MemoryRegion *mr, Error **errp) 2096 2096 { 2097 2097 RAMBlock *new_block; ··· 2114 2114 if (resizeable) { 2115 2115 new_block->flags |= RAM_RESIZEABLE; 2116 2116 } 2117 - ram_block_add(new_block, &local_err); 2117 + ram_block_add(new_block, &local_err, share); 2118 2118 if (local_err) { 2119 2119 g_free(new_block); 2120 2120 error_propagate(errp, local_err); ··· 2126 2126 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, 2127 2127 MemoryRegion *mr, Error **errp) 2128 2128 { 2129 - return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp); 2129 + return qemu_ram_alloc_internal(size, size, NULL, host, false, 2130 + false, mr, errp); 2130 2131 } 2131 2132 2132 - RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp) 2133 + RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share, 2134 + MemoryRegion *mr, Error **errp) 2133 2135 { 2134 - return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp); 2136 + return qemu_ram_alloc_internal(size, size, NULL, NULL, false, 2137 + share, mr, errp); 2135 2138 } 2136 2139 2137 2140 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz, ··· 2140 2143 void *host), 2141 2144 MemoryRegion *mr, Error **errp) 2142 2145 { 2143 - return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp); 2146 + return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, 2147 + false, mr, errp); 2144 2148 } 2145 2149 2146 2150 static void reclaim_ramblock(RAMBlock *block)
+1
hw/Makefile.objs
···
18 18   devices-dirs-$(CONFIG_SOFTMMU) += isa/
19 19   devices-dirs-$(CONFIG_SOFTMMU) += misc/
20 20   devices-dirs-$(CONFIG_SOFTMMU) += net/
   21 + devices-dirs-$(CONFIG_SOFTMMU) += rdma/
21 22   devices-dirs-$(CONFIG_SOFTMMU) += nvram/
22 23   devices-dirs-$(CONFIG_SOFTMMU) += pci/
23 24   devices-dirs-$(CONFIG_PCI) += pci-bridge/ pci-host/
+5
hw/rdma/Makefile.objs
···
1 + ifeq ($(CONFIG_RDMA),y)
2 + obj-$(CONFIG_PCI) += rdma_utils.o rdma_backend.o rdma_rm.o
3 + obj-$(CONFIG_PCI) += vmw/pvrdma_dev_ring.o vmw/pvrdma_cmd.o \
4 +                      vmw/pvrdma_qp_ops.o vmw/pvrdma_main.o
5 + endif
+818
hw/rdma/rdma_backend.c
··· 1 + /* 2 + * QEMU paravirtual RDMA - Generic RDMA backend 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include <qemu/osdep.h> 17 + #include <qemu/error-report.h> 18 + #include <qapi/error.h> 19 + 20 + #include <infiniband/verbs.h> 21 + 22 + #include "trace.h" 23 + #include "rdma_utils.h" 24 + #include "rdma_rm.h" 25 + #include "rdma_backend.h" 26 + 27 + /* Vendor Errors */ 28 + #define VENDOR_ERR_FAIL_BACKEND 0x201 29 + #define VENDOR_ERR_TOO_MANY_SGES 0x202 30 + #define VENDOR_ERR_NOMEM 0x203 31 + #define VENDOR_ERR_QP0 0x204 32 + #define VENDOR_ERR_NO_SGE 0x205 33 + #define VENDOR_ERR_MAD_SEND 0x206 34 + #define VENDOR_ERR_INVLKEY 0x207 35 + #define VENDOR_ERR_MR_SMALL 0x208 36 + 37 + #define THR_NAME_LEN 16 38 + 39 + typedef struct BackendCtx { 40 + uint64_t req_id; 41 + void *up_ctx; 42 + bool is_tx_req; 43 + } BackendCtx; 44 + 45 + static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx); 46 + 47 + static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx) 48 + { 49 + pr_err("No completion handler is registered\n"); 50 + } 51 + 52 + static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq) 53 + { 54 + int i, ne; 55 + BackendCtx *bctx; 56 + struct ibv_wc wc[2]; 57 + 58 + pr_dbg("Entering poll_cq loop on cq %p\n", ibcq); 59 + do { 60 + ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc); 61 + 62 + pr_dbg("Got %d completion(s) from cq %p\n", ne, ibcq); 63 + 64 + for (i = 0; i < ne; i++) { 65 + pr_dbg("wr_id=0x%lx\n", wc[i].wr_id); 66 + pr_dbg("status=%d\n", wc[i].status); 67 + 68 + bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id); 69 + if (unlikely(!bctx)) { 70 + pr_dbg("Error: Failed to find ctx for req %ld\n", wc[i].wr_id); 71 + continue; 72 + } 73 + pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? 
"send" : "recv"); 74 + 75 + comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx); 76 + 77 + rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id); 78 + g_free(bctx); 79 + } 80 + } while (ne > 0); 81 + 82 + if (ne < 0) { 83 + pr_dbg("Got error %d from ibv_poll_cq\n", ne); 84 + } 85 + } 86 + 87 + static void *comp_handler_thread(void *arg) 88 + { 89 + RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg; 90 + int rc; 91 + struct ibv_cq *ev_cq; 92 + void *ev_ctx; 93 + 94 + pr_dbg("Starting\n"); 95 + 96 + while (backend_dev->comp_thread.run) { 97 + pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel); 98 + rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx); 99 + pr_dbg("ibv_get_cq_event=%d\n", rc); 100 + if (unlikely(rc)) { 101 + pr_dbg("---> ibv_get_cq_event (%d)\n", rc); 102 + continue; 103 + } 104 + 105 + rc = ibv_req_notify_cq(ev_cq, 0); 106 + if (unlikely(rc)) { 107 + pr_dbg("Error %d from ibv_req_notify_cq\n", rc); 108 + } 109 + 110 + poll_cq(backend_dev->rdma_dev_res, ev_cq); 111 + 112 + ibv_ack_cq_events(ev_cq, 1); 113 + } 114 + 115 + pr_dbg("Going down\n"); 116 + 117 + /* TODO: Post cqe for all remaining buffs that were posted */ 118 + 119 + return NULL; 120 + } 121 + 122 + void rdma_backend_register_comp_handler(void (*handler)(int status, 123 + unsigned int vendor_err, void *ctx)) 124 + { 125 + comp_handler = handler; 126 + } 127 + 128 + void rdma_backend_unregister_comp_handler(void) 129 + { 130 + rdma_backend_register_comp_handler(dummy_comp_handler); 131 + } 132 + 133 + int rdma_backend_query_port(RdmaBackendDev *backend_dev, 134 + struct ibv_port_attr *port_attr) 135 + { 136 + int rc; 137 + 138 + rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr); 139 + if (rc) { 140 + pr_dbg("Error %d from ibv_query_port\n", rc); 141 + return -EIO; 142 + } 143 + 144 + return 0; 145 + } 146 + 147 + void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq) 148 + { 149 + poll_cq(rdma_dev_res, cq->ibcq); 150 + } 151 + 152 + static GHashTable *ah_hash; 153 + 154 + static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd, 155 + uint8_t sgid_idx, union ibv_gid *dgid) 156 + { 157 + GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid)); 158 + struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key); 159 + 160 + if (ah) { 161 + trace_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix), 162 + be64_to_cpu(dgid->global.interface_id)); 163 + g_bytes_unref(ah_key); 164 + } else { 165 + struct ibv_ah_attr ah_attr = { 166 + .is_global = 1, 167 + .port_num = backend_dev->port_num, 168 + .grh.hop_limit = 1, 169 + }; 170 + 171 + ah_attr.grh.dgid = *dgid; 172 + ah_attr.grh.sgid_index = sgid_idx; 173 + 174 + ah = ibv_create_ah(pd, &ah_attr); 175 + if (ah) { 176 + g_hash_table_insert(ah_hash, ah_key, ah); 177 + } else { 178 + g_bytes_unref(ah_key); 179 + pr_dbg("ibv_create_ah failed for gid <%lx %lx>\n", 180 + be64_to_cpu(dgid->global.subnet_prefix), 181 + be64_to_cpu(dgid->global.interface_id)); 182 + } 183 + 184 + trace_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix), 185 + be64_to_cpu(dgid->global.interface_id)); 186 + } 187 + 188 + return ah; 189 + } 190 + 191 + static void destroy_ah_hash_key(gpointer data) 192 + { 193 + g_bytes_unref(data); 194 + } 195 + 196 + static void destroy_ah_hast_data(gpointer data) 197 + { 198 + struct ibv_ah *ah = data; 199 + 200 + ibv_destroy_ah(ah); 201 + } 202 + 203 + static void ah_cache_init(void) 204 + { 205 + ah_hash = g_hash_table_new_full(g_bytes_hash, 
g_bytes_equal, 206 + destroy_ah_hash_key, destroy_ah_hast_data); 207 + } 208 + 209 + static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, 210 + struct ibv_sge *dsge, struct ibv_sge *ssge, 211 + uint8_t num_sge) 212 + { 213 + RdmaRmMR *mr; 214 + int ssge_idx; 215 + 216 + pr_dbg("num_sge=%d\n", num_sge); 217 + 218 + for (ssge_idx = 0; ssge_idx < num_sge; ssge_idx++) { 219 + mr = rdma_rm_get_mr(rdma_dev_res, ssge[ssge_idx].lkey); 220 + if (unlikely(!mr)) { 221 + pr_dbg("Invalid lkey 0x%x\n", ssge[ssge_idx].lkey); 222 + return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey; 223 + } 224 + 225 + dsge->addr = mr->user_mr.host_virt + ssge[ssge_idx].addr - 226 + mr->user_mr.guest_start; 227 + dsge->length = ssge[ssge_idx].length; 228 + dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr); 229 + 230 + pr_dbg("ssge->addr=0x%lx\n", (uint64_t)ssge[ssge_idx].addr); 231 + pr_dbg("dsge->addr=0x%lx\n", dsge->addr); 232 + pr_dbg("dsge->length=%d\n", dsge->length); 233 + pr_dbg("dsge->lkey=0x%x\n", dsge->lkey); 234 + 235 + dsge++; 236 + } 237 + 238 + return 0; 239 + } 240 + 241 + void rdma_backend_post_send(RdmaBackendDev *backend_dev, 242 + RdmaBackendQP *qp, uint8_t qp_type, 243 + struct ibv_sge *sge, uint32_t num_sge, 244 + union ibv_gid *dgid, uint32_t dqpn, 245 + uint32_t dqkey, void *ctx) 246 + { 247 + BackendCtx *bctx; 248 + struct ibv_sge new_sge[MAX_SGE]; 249 + uint32_t bctx_id; 250 + int rc; 251 + struct ibv_send_wr wr = {0}, *bad_wr; 252 + 253 + if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */ 254 + if (qp_type == IBV_QPT_SMI) { 255 + pr_dbg("QP0 unsupported\n"); 256 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); 257 + } else if (qp_type == IBV_QPT_GSI) { 258 + pr_dbg("QP1\n"); 259 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); 260 + } 261 + pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type); 262 + return; 263 + } 264 + 265 + pr_dbg("num_sge=%d\n", num_sge); 266 + if (!num_sge) { 267 + pr_dbg("num_sge=0\n"); 268 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx); 269 + return; 270 + } 271 + 272 + bctx = g_malloc0(sizeof(*bctx)); 273 + bctx->up_ctx = ctx; 274 + bctx->is_tx_req = 1; 275 + 276 + rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx); 277 + if (unlikely(rc)) { 278 + pr_dbg("Failed to allocate cqe_ctx\n"); 279 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx); 280 + goto out_free_bctx; 281 + } 282 + 283 + rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge); 284 + if (rc) { 285 + pr_dbg("Error: Failed to build host SGE array\n"); 286 + comp_handler(IBV_WC_GENERAL_ERR, rc, ctx); 287 + goto out_dealloc_cqe_ctx; 288 + } 289 + 290 + if (qp_type == IBV_QPT_UD) { 291 + wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, 292 + backend_dev->backend_gid_idx, dgid); 293 + wr.wr.ud.remote_qpn = dqpn; 294 + wr.wr.ud.remote_qkey = dqkey; 295 + } 296 + 297 + wr.num_sge = num_sge; 298 + wr.opcode = IBV_WR_SEND; 299 + wr.send_flags = IBV_SEND_SIGNALED; 300 + wr.sg_list = new_sge; 301 + wr.wr_id = bctx_id; 302 + 303 + rc = ibv_post_send(qp->ibqp, &wr, &bad_wr); 304 + pr_dbg("ibv_post_send=%d\n", rc); 305 + if (rc) { 306 + pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno, 307 + qp->ibqp->qp_num); 308 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); 309 + goto out_dealloc_cqe_ctx; 310 + } 311 + 312 + return; 313 + 314 + out_dealloc_cqe_ctx: 315 + rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id); 316 + 317 + out_free_bctx: 318 + g_free(bctx); 319 + } 320 
+ 321 + void rdma_backend_post_recv(RdmaBackendDev *backend_dev, 322 + RdmaDeviceResources *rdma_dev_res, 323 + RdmaBackendQP *qp, uint8_t qp_type, 324 + struct ibv_sge *sge, uint32_t num_sge, void *ctx) 325 + { 326 + BackendCtx *bctx; 327 + struct ibv_sge new_sge[MAX_SGE]; 328 + uint32_t bctx_id; 329 + int rc; 330 + struct ibv_recv_wr wr = {0}, *bad_wr; 331 + 332 + if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */ 333 + if (qp_type == IBV_QPT_SMI) { 334 + pr_dbg("QP0 unsupported\n"); 335 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); 336 + } 337 + if (qp_type == IBV_QPT_GSI) { 338 + pr_dbg("QP1\n"); 339 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); 340 + } 341 + return; 342 + } 343 + 344 + pr_dbg("num_sge=%d\n", num_sge); 345 + if (!num_sge) { 346 + pr_dbg("num_sge=0\n"); 347 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx); 348 + return; 349 + } 350 + 351 + bctx = g_malloc0(sizeof(*bctx)); 352 + bctx->up_ctx = ctx; 353 + bctx->is_tx_req = 0; 354 + 355 + rc = rdma_rm_alloc_cqe_ctx(rdma_dev_res, &bctx_id, bctx); 356 + if (unlikely(rc)) { 357 + pr_dbg("Failed to allocate cqe_ctx\n"); 358 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx); 359 + goto out_free_bctx; 360 + } 361 + 362 + rc = build_host_sge_array(rdma_dev_res, new_sge, sge, num_sge); 363 + if (rc) { 364 + pr_dbg("Error: Failed to build host SGE array\n"); 365 + comp_handler(IBV_WC_GENERAL_ERR, rc, ctx); 366 + goto out_dealloc_cqe_ctx; 367 + } 368 + 369 + wr.num_sge = num_sge; 370 + wr.sg_list = new_sge; 371 + wr.wr_id = bctx_id; 372 + rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr); 373 + pr_dbg("ibv_post_recv=%d\n", rc); 374 + if (rc) { 375 + pr_dbg("Fail (%d, %d) to post recv WQE to qpn %d\n", rc, errno, 376 + qp->ibqp->qp_num); 377 + comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); 378 + goto out_dealloc_cqe_ctx; 379 + } 380 + 381 + return; 382 + 383 + out_dealloc_cqe_ctx: 384 + rdma_rm_dealloc_cqe_ctx(rdma_dev_res, bctx_id); 385 + 386 + out_free_bctx: 387 + g_free(bctx); 388 + } 389 + 390 + int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd) 391 + { 392 + pd->ibpd = ibv_alloc_pd(backend_dev->context); 393 + 394 + return pd->ibpd ? 0 : -EIO; 395 + } 396 + 397 + void rdma_backend_destroy_pd(RdmaBackendPD *pd) 398 + { 399 + if (pd->ibpd) { 400 + ibv_dealloc_pd(pd->ibpd); 401 + } 402 + } 403 + 404 + int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, uint64_t addr, 405 + size_t length, int access) 406 + { 407 + pr_dbg("addr=0x%lx\n", addr); 408 + pr_dbg("len=%ld\n", length); 409 + mr->ibmr = ibv_reg_mr(pd->ibpd, (void *)addr, length, access); 410 + if (mr->ibmr) { 411 + pr_dbg("lkey=0x%x\n", mr->ibmr->lkey); 412 + pr_dbg("rkey=0x%x\n", mr->ibmr->rkey); 413 + mr->ibpd = pd->ibpd; 414 + } 415 + 416 + return mr->ibmr ? 0 : -EIO; 417 + } 418 + 419 + void rdma_backend_destroy_mr(RdmaBackendMR *mr) 420 + { 421 + if (mr->ibmr) { 422 + ibv_dereg_mr(mr->ibmr); 423 + } 424 + } 425 + 426 + int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq, 427 + int cqe) 428 + { 429 + int rc; 430 + 431 + pr_dbg("cqe=%d\n", cqe); 432 + 433 + pr_dbg("dev->channel=%p\n", backend_dev->channel); 434 + cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL, 435 + backend_dev->channel, 0); 436 + 437 + if (cq->ibcq) { 438 + rc = ibv_req_notify_cq(cq->ibcq, 0); 439 + if (rc) { 440 + pr_dbg("Error %d from ibv_req_notify_cq\n", rc); 441 + } 442 + cq->backend_dev = backend_dev; 443 + } 444 + 445 + return cq->ibcq ? 
0 : -EIO; 446 + } 447 + 448 + void rdma_backend_destroy_cq(RdmaBackendCQ *cq) 449 + { 450 + if (cq->ibcq) { 451 + ibv_destroy_cq(cq->ibcq); 452 + } 453 + } 454 + 455 + int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type, 456 + RdmaBackendPD *pd, RdmaBackendCQ *scq, 457 + RdmaBackendCQ *rcq, uint32_t max_send_wr, 458 + uint32_t max_recv_wr, uint32_t max_send_sge, 459 + uint32_t max_recv_sge) 460 + { 461 + struct ibv_qp_init_attr attr = {0}; 462 + 463 + qp->ibqp = 0; 464 + pr_dbg("qp_type=%d\n", qp_type); 465 + 466 + switch (qp_type) { 467 + case IBV_QPT_GSI: 468 + pr_dbg("QP1 unsupported\n"); 469 + return 0; 470 + 471 + case IBV_QPT_RC: 472 + /* fall through */ 473 + case IBV_QPT_UD: 474 + /* do nothing */ 475 + break; 476 + 477 + default: 478 + pr_dbg("Unsupported QP type %d\n", qp_type); 479 + return -EIO; 480 + } 481 + 482 + attr.qp_type = qp_type; 483 + attr.send_cq = scq->ibcq; 484 + attr.recv_cq = rcq->ibcq; 485 + attr.cap.max_send_wr = max_send_wr; 486 + attr.cap.max_recv_wr = max_recv_wr; 487 + attr.cap.max_send_sge = max_send_sge; 488 + attr.cap.max_recv_sge = max_recv_sge; 489 + 490 + pr_dbg("max_send_wr=%d\n", max_send_wr); 491 + pr_dbg("max_recv_wr=%d\n", max_recv_wr); 492 + pr_dbg("max_send_sge=%d\n", max_send_sge); 493 + pr_dbg("max_recv_sge=%d\n", max_recv_sge); 494 + 495 + qp->ibqp = ibv_create_qp(pd->ibpd, &attr); 496 + if (likely(!qp->ibqp)) { 497 + pr_dbg("Error from ibv_create_qp\n"); 498 + return -EIO; 499 + } 500 + 501 + qp->ibpd = pd->ibpd; 502 + 503 + /* TODO: Query QP to get max_inline_data and save it to be used in send */ 504 + 505 + pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num); 506 + 507 + return 0; 508 + } 509 + 510 + int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp, 511 + uint8_t qp_type, uint32_t qkey) 512 + { 513 + struct ibv_qp_attr attr = {0}; 514 + int rc, attr_mask; 515 + 516 + pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num); 517 + pr_dbg("sport_num=%d\n", backend_dev->port_num); 518 + 519 + attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; 520 + attr.qp_state = IBV_QPS_INIT; 521 + attr.pkey_index = 0; 522 + attr.port_num = backend_dev->port_num; 523 + 524 + switch (qp_type) { 525 + case IBV_QPT_RC: 526 + attr_mask |= IBV_QP_ACCESS_FLAGS; 527 + break; 528 + 529 + case IBV_QPT_UD: 530 + attr.qkey = qkey; 531 + attr_mask |= IBV_QP_QKEY; 532 + break; 533 + 534 + default: 535 + pr_dbg("Unsupported QP type %d\n", qp_type); 536 + return -EIO; 537 + } 538 + 539 + rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask); 540 + if (rc) { 541 + pr_dbg("Error %d from ibv_modify_qp\n", rc); 542 + return -EIO; 543 + } 544 + 545 + return 0; 546 + } 547 + 548 + int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp, 549 + uint8_t qp_type, union ibv_gid *dgid, 550 + uint32_t dqpn, uint32_t rq_psn, uint32_t qkey, 551 + bool use_qkey) 552 + { 553 + struct ibv_qp_attr attr = {0}; 554 + union ibv_gid ibv_gid = { 555 + .global.interface_id = dgid->global.interface_id, 556 + .global.subnet_prefix = dgid->global.subnet_prefix 557 + }; 558 + int rc, attr_mask; 559 + 560 + attr.qp_state = IBV_QPS_RTR; 561 + attr_mask = IBV_QP_STATE; 562 + 563 + switch (qp_type) { 564 + case IBV_QPT_RC: 565 + pr_dbg("dgid=0x%lx,%lx\n", 566 + be64_to_cpu(ibv_gid.global.subnet_prefix), 567 + be64_to_cpu(ibv_gid.global.interface_id)); 568 + pr_dbg("dqpn=0x%x\n", dqpn); 569 + pr_dbg("sgid_idx=%d\n", backend_dev->backend_gid_idx); 570 + pr_dbg("sport_num=%d\n", backend_dev->port_num); 571 + pr_dbg("rq_psn=0x%x\n", rq_psn); 572 + 573 + attr.path_mtu = 
IBV_MTU_1024; 574 + attr.dest_qp_num = dqpn; 575 + attr.max_dest_rd_atomic = 1; 576 + attr.min_rnr_timer = 12; 577 + attr.ah_attr.port_num = backend_dev->port_num; 578 + attr.ah_attr.is_global = 1; 579 + attr.ah_attr.grh.hop_limit = 1; 580 + attr.ah_attr.grh.dgid = ibv_gid; 581 + attr.ah_attr.grh.sgid_index = backend_dev->backend_gid_idx; 582 + attr.rq_psn = rq_psn; 583 + 584 + attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | 585 + IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | 586 + IBV_QP_MIN_RNR_TIMER; 587 + break; 588 + 589 + case IBV_QPT_UD: 590 + if (use_qkey) { 591 + pr_dbg("qkey=0x%x\n", qkey); 592 + attr.qkey = qkey; 593 + attr_mask |= IBV_QP_QKEY; 594 + } 595 + break; 596 + } 597 + 598 + rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask); 599 + if (rc) { 600 + pr_dbg("Error %d from ibv_modify_qp\n", rc); 601 + return -EIO; 602 + } 603 + 604 + return 0; 605 + } 606 + 607 + int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type, 608 + uint32_t sq_psn, uint32_t qkey, bool use_qkey) 609 + { 610 + struct ibv_qp_attr attr = {0}; 611 + int rc, attr_mask; 612 + 613 + pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num); 614 + pr_dbg("sq_psn=0x%x\n", sq_psn); 615 + 616 + attr.qp_state = IBV_QPS_RTS; 617 + attr.sq_psn = sq_psn; 618 + attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN; 619 + 620 + switch (qp_type) { 621 + case IBV_QPT_RC: 622 + attr.timeout = 14; 623 + attr.retry_cnt = 7; 624 + attr.rnr_retry = 7; 625 + attr.max_rd_atomic = 1; 626 + 627 + attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | 628 + IBV_QP_MAX_QP_RD_ATOMIC; 629 + break; 630 + 631 + case IBV_QPT_UD: 632 + if (use_qkey) { 633 + pr_dbg("qkey=0x%x\n", qkey); 634 + attr.qkey = qkey; 635 + attr_mask |= IBV_QP_QKEY; 636 + } 637 + break; 638 + } 639 + 640 + rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask); 641 + if (rc) { 642 + pr_dbg("Error %d from ibv_modify_qp\n", rc); 643 + return -EIO; 644 + } 645 + 646 + return 0; 647 + } 648 + 649 + void rdma_backend_destroy_qp(RdmaBackendQP *qp) 650 + { 651 + if (qp->ibqp) { 652 + ibv_destroy_qp(qp->ibqp); 653 + } 654 + } 655 + 656 + #define CHK_ATTR(req, dev, member, fmt) ({ \ 657 + pr_dbg("%s="fmt","fmt"\n", #member, dev.member, req->member); \ 658 + if (req->member > dev.member) { \ 659 + warn_report("%s = 0x%lx is higher than host device capability 0x%lx", \ 660 + #member, (uint64_t)req->member, (uint64_t)dev.member); \ 661 + req->member = dev.member; \ 662 + } \ 663 + pr_dbg("%s="fmt"\n", #member, req->member); }) 664 + 665 + static int init_device_caps(RdmaBackendDev *backend_dev, 666 + struct ibv_device_attr *dev_attr) 667 + { 668 + if (ibv_query_device(backend_dev->context, &backend_dev->dev_attr)) { 669 + return -EIO; 670 + } 671 + 672 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_mr_size, "%ld"); 673 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp, "%d"); 674 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_sge, "%d"); 675 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_wr, "%d"); 676 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_cq, "%d"); 677 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_cqe, "%d"); 678 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_mr, "%d"); 679 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_pd, "%d"); 680 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_rd_atom, "%d"); 681 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_init_rd_atom, "%d"); 682 + CHK_ATTR(dev_attr, backend_dev->dev_attr, max_ah, "%d"); 683 + 684 + return 0; 685 + } 686 + 687 + int rdma_backend_init(RdmaBackendDev *backend_dev, 688 + RdmaDeviceResources 
*rdma_dev_res, 689 + const char *backend_device_name, uint8_t port_num, 690 + uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, 691 + Error **errp) 692 + { 693 + int i; 694 + int ret = 0; 695 + int num_ibv_devices; 696 + char thread_name[THR_NAME_LEN] = {0}; 697 + struct ibv_device **dev_list; 698 + struct ibv_port_attr port_attr; 699 + 700 + backend_dev->backend_gid_idx = backend_gid_idx; 701 + backend_dev->port_num = port_num; 702 + backend_dev->rdma_dev_res = rdma_dev_res; 703 + 704 + rdma_backend_register_comp_handler(dummy_comp_handler); 705 + 706 + dev_list = ibv_get_device_list(&num_ibv_devices); 707 + if (!dev_list) { 708 + error_setg(errp, "Failed to get IB devices list"); 709 + return -EIO; 710 + } 711 + 712 + if (num_ibv_devices == 0) { 713 + error_setg(errp, "No IB devices were found"); 714 + ret = -ENXIO; 715 + goto out_free_dev_list; 716 + } 717 + 718 + if (backend_device_name) { 719 + for (i = 0; dev_list[i]; ++i) { 720 + if (!strcmp(ibv_get_device_name(dev_list[i]), 721 + backend_device_name)) { 722 + break; 723 + } 724 + } 725 + 726 + backend_dev->ib_dev = dev_list[i]; 727 + if (!backend_dev->ib_dev) { 728 + error_setg(errp, "Failed to find IB device %s", 729 + backend_device_name); 730 + ret = -EIO; 731 + goto out_free_dev_list; 732 + } 733 + } else { 734 + backend_dev->ib_dev = *dev_list; 735 + } 736 + 737 + pr_dbg("Using backend device %s, port %d, gid_idx %d\n", 738 + ibv_get_device_name(backend_dev->ib_dev), 739 + backend_dev->port_num, backend_dev->backend_gid_idx); 740 + 741 + backend_dev->context = ibv_open_device(backend_dev->ib_dev); 742 + if (!backend_dev->context) { 743 + error_setg(errp, "Failed to open IB device"); 744 + ret = -EIO; 745 + goto out; 746 + } 747 + 748 + backend_dev->channel = ibv_create_comp_channel(backend_dev->context); 749 + if (!backend_dev->channel) { 750 + error_setg(errp, "Failed to create IB communication channel"); 751 + ret = -EIO; 752 + goto out_close_device; 753 + } 754 + pr_dbg("dev->backend_dev.channel=%p\n", backend_dev->channel); 755 + 756 + ret = ibv_query_port(backend_dev->context, backend_dev->port_num, 757 + &port_attr); 758 + if (ret) { 759 + error_setg(errp, "Error %d from ibv_query_port", ret); 760 + ret = -EIO; 761 + goto out_destroy_comm_channel; 762 + } 763 + 764 + if (backend_dev->backend_gid_idx > port_attr.gid_tbl_len) { 765 + error_setg(errp, "Invalid backend_gid_idx, should be less than %d", 766 + port_attr.gid_tbl_len); 767 + goto out_destroy_comm_channel; 768 + } 769 + 770 + ret = init_device_caps(backend_dev, dev_attr); 771 + if (ret) { 772 + error_setg(errp, "Failed to initialize device capabilities"); 773 + ret = -EIO; 774 + goto out_destroy_comm_channel; 775 + } 776 + 777 + ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, 778 + backend_dev->backend_gid_idx, &backend_dev->gid); 779 + if (ret) { 780 + error_setg(errp, "Failed to query gid %d", 781 + backend_dev->backend_gid_idx); 782 + ret = -EIO; 783 + goto out_destroy_comm_channel; 784 + } 785 + pr_dbg("subnet_prefix=0x%lx\n", 786 + be64_to_cpu(backend_dev->gid.global.subnet_prefix)); 787 + pr_dbg("interface_id=0x%lx\n", 788 + be64_to_cpu(backend_dev->gid.global.interface_id)); 789 + 790 + snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s", 791 + ibv_get_device_name(backend_dev->ib_dev)); 792 + backend_dev->comp_thread.run = true; 793 + qemu_thread_create(&backend_dev->comp_thread.thread, thread_name, 794 + comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED); 795 + 796 + ah_cache_init(); 797 + 798 + goto out_free_dev_list; 
799 + 800 + out_destroy_comm_channel: 801 + ibv_destroy_comp_channel(backend_dev->channel); 802 + 803 + out_close_device: 804 + ibv_close_device(backend_dev->context); 805 + 806 + out_free_dev_list: 807 + ibv_free_device_list(dev_list); 808 + 809 + out: 810 + return ret; 811 + } 812 + 813 + void rdma_backend_fini(RdmaBackendDev *backend_dev) 814 + { 815 + g_hash_table_destroy(ah_hash); 816 + ibv_destroy_comp_channel(backend_dev->channel); 817 + ibv_close_device(backend_dev->context); 818 + }
+98
hw/rdma/rdma_backend.h
··· 1 + /* 2 + * RDMA device: Definitions of Backend Device functions 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef RDMA_BACKEND_H 17 + #define RDMA_BACKEND_H 18 + 19 + #include <qapi/error.h> 20 + #include "rdma_rm_defs.h" 21 + #include "rdma_backend_defs.h" 22 + 23 + /* Add definition for QP0 and QP1 as there is no userspace enums for them */ 24 + enum ibv_special_qp_type { 25 + IBV_QPT_SMI = 0, 26 + IBV_QPT_GSI = 1, 27 + }; 28 + 29 + static inline union ibv_gid *rdma_backend_gid(RdmaBackendDev *dev) 30 + { 31 + return &dev->gid; 32 + } 33 + 34 + static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp) 35 + { 36 + return qp->ibqp ? qp->ibqp->qp_num : 0; 37 + } 38 + 39 + static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr) 40 + { 41 + return mr->ibmr ? mr->ibmr->lkey : 0; 42 + } 43 + 44 + static inline uint32_t rdma_backend_mr_rkey(const RdmaBackendMR *mr) 45 + { 46 + return mr->ibmr ? mr->ibmr->rkey : 0; 47 + } 48 + 49 + int rdma_backend_init(RdmaBackendDev *backend_dev, 50 + RdmaDeviceResources *rdma_dev_res, 51 + const char *backend_device_name, uint8_t port_num, 52 + uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, 53 + Error **errp); 54 + void rdma_backend_fini(RdmaBackendDev *backend_dev); 55 + void rdma_backend_register_comp_handler(void (*handler)(int status, 56 + unsigned int vendor_err, void *ctx)); 57 + void rdma_backend_unregister_comp_handler(void); 58 + 59 + int rdma_backend_query_port(RdmaBackendDev *backend_dev, 60 + struct ibv_port_attr *port_attr); 61 + int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd); 62 + void rdma_backend_destroy_pd(RdmaBackendPD *pd); 63 + 64 + int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, uint64_t addr, 65 + size_t length, int access); 66 + void rdma_backend_destroy_mr(RdmaBackendMR *mr); 67 + 68 + int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq, 69 + int cqe); 70 + void rdma_backend_destroy_cq(RdmaBackendCQ *cq); 71 + void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq); 72 + 73 + int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type, 74 + RdmaBackendPD *pd, RdmaBackendCQ *scq, 75 + RdmaBackendCQ *rcq, uint32_t max_send_wr, 76 + uint32_t max_recv_wr, uint32_t max_send_sge, 77 + uint32_t max_recv_sge); 78 + int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp, 79 + uint8_t qp_type, uint32_t qkey); 80 + int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp, 81 + uint8_t qp_type, union ibv_gid *dgid, 82 + uint32_t dqpn, uint32_t rq_psn, uint32_t qkey, 83 + bool use_qkey); 84 + int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type, 85 + uint32_t sq_psn, uint32_t qkey, bool use_qkey); 86 + void rdma_backend_destroy_qp(RdmaBackendQP *qp); 87 + 88 + void rdma_backend_post_send(RdmaBackendDev *backend_dev, 89 + RdmaBackendQP *qp, uint8_t qp_type, 90 + struct ibv_sge *sge, uint32_t num_sge, 91 + union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey, 92 + void *ctx); 93 + void rdma_backend_post_recv(RdmaBackendDev *backend_dev, 94 + RdmaDeviceResources *rdma_dev_res, 95 + RdmaBackendQP *qp, uint8_t qp_type, 96 + struct ibv_sge *sge, uint32_t num_sge, void 
*ctx); 97 + 98 + #endif
+62
hw/rdma/rdma_backend_defs.h
··· 1 + /* 2 + * RDMA device: Definitions of Backend Device structures 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef RDMA_BACKEND_DEFS_H 17 + #define RDMA_BACKEND_DEFS_H 18 + 19 + #include <infiniband/verbs.h> 20 + #include <qemu/thread.h> 21 + 22 + typedef struct RdmaDeviceResources RdmaDeviceResources; 23 + 24 + typedef struct RdmaBackendThread { 25 + QemuThread thread; 26 + QemuMutex mutex; 27 + bool run; 28 + } RdmaBackendThread; 29 + 30 + typedef struct RdmaBackendDev { 31 + struct ibv_device_attr dev_attr; 32 + RdmaBackendThread comp_thread; 33 + union ibv_gid gid; 34 + PCIDevice *dev; 35 + RdmaDeviceResources *rdma_dev_res; 36 + struct ibv_device *ib_dev; 37 + struct ibv_context *context; 38 + struct ibv_comp_channel *channel; 39 + uint8_t port_num; 40 + uint8_t backend_gid_idx; 41 + } RdmaBackendDev; 42 + 43 + typedef struct RdmaBackendPD { 44 + struct ibv_pd *ibpd; 45 + } RdmaBackendPD; 46 + 47 + typedef struct RdmaBackendMR { 48 + struct ibv_pd *ibpd; 49 + struct ibv_mr *ibmr; 50 + } RdmaBackendMR; 51 + 52 + typedef struct RdmaBackendCQ { 53 + RdmaBackendDev *backend_dev; 54 + struct ibv_cq *ibcq; 55 + } RdmaBackendCQ; 56 + 57 + typedef struct RdmaBackendQP { 58 + struct ibv_pd *ibpd; 59 + struct ibv_qp *ibqp; 60 + } RdmaBackendQP; 61 + 62 + #endif
+544
hw/rdma/rdma_rm.c
··· 1 + /* 2 + * QEMU paravirtual RDMA - Resource Manager Implementation 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include <qemu/osdep.h> 17 + #include <qapi/error.h> 18 + #include <cpu.h> 19 + 20 + #include "rdma_utils.h" 21 + #include "rdma_backend.h" 22 + #include "rdma_rm.h" 23 + 24 + #define MAX_RM_TBL_NAME 16 25 + 26 + /* Page directory and page tables */ 27 + #define PG_DIR_SZ { TARGET_PAGE_SIZE / sizeof(__u64) } 28 + #define PG_TBL_SZ { TARGET_PAGE_SIZE / sizeof(__u64) } 29 + 30 + static inline void res_tbl_init(const char *name, RdmaRmResTbl *tbl, 31 + uint32_t tbl_sz, uint32_t res_sz) 32 + { 33 + tbl->tbl = g_malloc(tbl_sz * res_sz); 34 + 35 + strncpy(tbl->name, name, MAX_RM_TBL_NAME); 36 + tbl->name[MAX_RM_TBL_NAME - 1] = 0; 37 + 38 + tbl->bitmap = bitmap_new(tbl_sz); 39 + tbl->tbl_sz = tbl_sz; 40 + tbl->res_sz = res_sz; 41 + qemu_mutex_init(&tbl->lock); 42 + } 43 + 44 + static inline void res_tbl_free(RdmaRmResTbl *tbl) 45 + { 46 + qemu_mutex_destroy(&tbl->lock); 47 + g_free(tbl->tbl); 48 + bitmap_zero_extend(tbl->bitmap, tbl->tbl_sz, 0); 49 + } 50 + 51 + static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle) 52 + { 53 + pr_dbg("%s, handle=%d\n", tbl->name, handle); 54 + 55 + if ((handle < tbl->tbl_sz) && (test_bit(handle, tbl->bitmap))) { 56 + return tbl->tbl + handle * tbl->res_sz; 57 + } else { 58 + pr_dbg("Invalid handle %d\n", handle); 59 + return NULL; 60 + } 61 + } 62 + 63 + static inline void *res_tbl_alloc(RdmaRmResTbl *tbl, uint32_t *handle) 64 + { 65 + qemu_mutex_lock(&tbl->lock); 66 + 67 + *handle = find_first_zero_bit(tbl->bitmap, tbl->tbl_sz); 68 + if (*handle > tbl->tbl_sz) { 69 + pr_dbg("Failed to alloc, bitmap is full\n"); 70 + qemu_mutex_unlock(&tbl->lock); 71 + return NULL; 72 + } 73 + 74 + set_bit(*handle, tbl->bitmap); 75 + 76 + qemu_mutex_unlock(&tbl->lock); 77 + 78 + memset(tbl->tbl + *handle * tbl->res_sz, 0, tbl->res_sz); 79 + 80 + pr_dbg("%s, handle=%d\n", tbl->name, *handle); 81 + 82 + return tbl->tbl + *handle * tbl->res_sz; 83 + } 84 + 85 + static inline void res_tbl_dealloc(RdmaRmResTbl *tbl, uint32_t handle) 86 + { 87 + pr_dbg("%s, handle=%d\n", tbl->name, handle); 88 + 89 + qemu_mutex_lock(&tbl->lock); 90 + 91 + if (handle < tbl->tbl_sz) { 92 + clear_bit(handle, tbl->bitmap); 93 + } 94 + 95 + qemu_mutex_unlock(&tbl->lock); 96 + } 97 + 98 + int rdma_rm_alloc_pd(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, 99 + uint32_t *pd_handle, uint32_t ctx_handle) 100 + { 101 + RdmaRmPD *pd; 102 + int ret = -ENOMEM; 103 + 104 + pd = res_tbl_alloc(&dev_res->pd_tbl, pd_handle); 105 + if (!pd) { 106 + goto out; 107 + } 108 + 109 + ret = rdma_backend_create_pd(backend_dev, &pd->backend_pd); 110 + if (ret) { 111 + ret = -EIO; 112 + goto out_tbl_dealloc; 113 + } 114 + 115 + pd->ctx_handle = ctx_handle; 116 + 117 + return 0; 118 + 119 + out_tbl_dealloc: 120 + res_tbl_dealloc(&dev_res->pd_tbl, *pd_handle); 121 + 122 + out: 123 + return ret; 124 + } 125 + 126 + RdmaRmPD *rdma_rm_get_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle) 127 + { 128 + return res_tbl_get(&dev_res->pd_tbl, pd_handle); 129 + } 130 + 131 + void rdma_rm_dealloc_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle) 132 + { 133 + RdmaRmPD *pd = 
rdma_rm_get_pd(dev_res, pd_handle); 134 + 135 + if (pd) { 136 + rdma_backend_destroy_pd(&pd->backend_pd); 137 + res_tbl_dealloc(&dev_res->pd_tbl, pd_handle); 138 + } 139 + } 140 + 141 + int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle, 142 + uint64_t guest_start, size_t guest_length, void *host_virt, 143 + int access_flags, uint32_t *mr_handle, uint32_t *lkey, 144 + uint32_t *rkey) 145 + { 146 + RdmaRmMR *mr; 147 + int ret = 0; 148 + RdmaRmPD *pd; 149 + uint64_t addr; 150 + size_t length; 151 + 152 + pd = rdma_rm_get_pd(dev_res, pd_handle); 153 + if (!pd) { 154 + pr_dbg("Invalid PD\n"); 155 + return -EINVAL; 156 + } 157 + 158 + mr = res_tbl_alloc(&dev_res->mr_tbl, mr_handle); 159 + if (!mr) { 160 + pr_dbg("Failed to allocate obj in table\n"); 161 + return -ENOMEM; 162 + } 163 + 164 + if (!host_virt) { 165 + /* TODO: This is my guess but not so sure that this needs to be 166 + * done */ 167 + length = TARGET_PAGE_SIZE; 168 + addr = (uint64_t)g_malloc(length); 169 + } else { 170 + mr->user_mr.host_virt = (uint64_t) host_virt; 171 + pr_dbg("host_virt=0x%lx\n", mr->user_mr.host_virt); 172 + mr->user_mr.length = guest_length; 173 + pr_dbg("length=0x%lx\n", guest_length); 174 + mr->user_mr.guest_start = guest_start; 175 + pr_dbg("guest_start=0x%lx\n", mr->user_mr.guest_start); 176 + 177 + length = mr->user_mr.length; 178 + addr = mr->user_mr.host_virt; 179 + } 180 + 181 + ret = rdma_backend_create_mr(&mr->backend_mr, &pd->backend_pd, addr, length, 182 + access_flags); 183 + if (ret) { 184 + pr_dbg("Fail in rdma_backend_create_mr, err=%d\n", ret); 185 + ret = -EIO; 186 + goto out_dealloc_mr; 187 + } 188 + 189 + if (!host_virt) { 190 + *lkey = mr->lkey = rdma_backend_mr_lkey(&mr->backend_mr); 191 + *rkey = mr->rkey = rdma_backend_mr_rkey(&mr->backend_mr); 192 + } else { 193 + /* We keep mr_handle in lkey so send and recv get get mr ptr */ 194 + *lkey = *mr_handle; 195 + *rkey = -1; 196 + } 197 + 198 + mr->pd_handle = pd_handle; 199 + 200 + return 0; 201 + 202 + out_dealloc_mr: 203 + res_tbl_dealloc(&dev_res->mr_tbl, *mr_handle); 204 + 205 + return ret; 206 + } 207 + 208 + RdmaRmMR *rdma_rm_get_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle) 209 + { 210 + return res_tbl_get(&dev_res->mr_tbl, mr_handle); 211 + } 212 + 213 + void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle) 214 + { 215 + RdmaRmMR *mr = rdma_rm_get_mr(dev_res, mr_handle); 216 + 217 + if (mr) { 218 + rdma_backend_destroy_mr(&mr->backend_mr); 219 + munmap((void *)mr->user_mr.host_virt, mr->user_mr.length); 220 + res_tbl_dealloc(&dev_res->mr_tbl, mr_handle); 221 + } 222 + } 223 + 224 + int rdma_rm_alloc_uc(RdmaDeviceResources *dev_res, uint32_t pfn, 225 + uint32_t *uc_handle) 226 + { 227 + RdmaRmUC *uc; 228 + 229 + /* TODO: Need to make sure pfn is between bar start address and 230 + * bsd+RDMA_BAR2_UAR_SIZE 231 + if (pfn > RDMA_BAR2_UAR_SIZE) { 232 + pr_err("pfn out of range (%d > %d)\n", pfn, RDMA_BAR2_UAR_SIZE); 233 + return -ENOMEM; 234 + } 235 + */ 236 + 237 + uc = res_tbl_alloc(&dev_res->uc_tbl, uc_handle); 238 + if (!uc) { 239 + return -ENOMEM; 240 + } 241 + 242 + return 0; 243 + } 244 + 245 + RdmaRmUC *rdma_rm_get_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle) 246 + { 247 + return res_tbl_get(&dev_res->uc_tbl, uc_handle); 248 + } 249 + 250 + void rdma_rm_dealloc_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle) 251 + { 252 + RdmaRmUC *uc = rdma_rm_get_uc(dev_res, uc_handle); 253 + 254 + if (uc) { 255 + res_tbl_dealloc(&dev_res->uc_tbl, uc_handle); 256 + } 257 + } 258 + 
259 + RdmaRmCQ *rdma_rm_get_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle) 260 + { 261 + return res_tbl_get(&dev_res->cq_tbl, cq_handle); 262 + } 263 + 264 + int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, 265 + uint32_t cqe, uint32_t *cq_handle, void *opaque) 266 + { 267 + int rc; 268 + RdmaRmCQ *cq; 269 + 270 + cq = res_tbl_alloc(&dev_res->cq_tbl, cq_handle); 271 + if (!cq) { 272 + return -ENOMEM; 273 + } 274 + 275 + cq->opaque = opaque; 276 + cq->notify = false; 277 + 278 + rc = rdma_backend_create_cq(backend_dev, &cq->backend_cq, cqe); 279 + if (rc) { 280 + rc = -EIO; 281 + goto out_dealloc_cq; 282 + } 283 + 284 + return 0; 285 + 286 + out_dealloc_cq: 287 + rdma_rm_dealloc_cq(dev_res, *cq_handle); 288 + 289 + return rc; 290 + } 291 + 292 + void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle, 293 + bool notify) 294 + { 295 + RdmaRmCQ *cq; 296 + 297 + pr_dbg("cq_handle=%d, notify=0x%x\n", cq_handle, notify); 298 + 299 + cq = rdma_rm_get_cq(dev_res, cq_handle); 300 + if (!cq) { 301 + return; 302 + } 303 + 304 + cq->notify = notify; 305 + pr_dbg("notify=%d\n", cq->notify); 306 + } 307 + 308 + void rdma_rm_dealloc_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle) 309 + { 310 + RdmaRmCQ *cq; 311 + 312 + cq = rdma_rm_get_cq(dev_res, cq_handle); 313 + if (!cq) { 314 + return; 315 + } 316 + 317 + rdma_backend_destroy_cq(&cq->backend_cq); 318 + 319 + res_tbl_dealloc(&dev_res->cq_tbl, cq_handle); 320 + } 321 + 322 + RdmaRmQP *rdma_rm_get_qp(RdmaDeviceResources *dev_res, uint32_t qpn) 323 + { 324 + GBytes *key = g_bytes_new(&qpn, sizeof(qpn)); 325 + 326 + RdmaRmQP *qp = g_hash_table_lookup(dev_res->qp_hash, key); 327 + 328 + g_bytes_unref(key); 329 + 330 + return qp; 331 + } 332 + 333 + int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle, 334 + uint8_t qp_type, uint32_t max_send_wr, 335 + uint32_t max_send_sge, uint32_t send_cq_handle, 336 + uint32_t max_recv_wr, uint32_t max_recv_sge, 337 + uint32_t recv_cq_handle, void *opaque, uint32_t *qpn) 338 + { 339 + int rc; 340 + RdmaRmQP *qp; 341 + RdmaRmCQ *scq, *rcq; 342 + RdmaRmPD *pd; 343 + uint32_t rm_qpn; 344 + 345 + pr_dbg("qp_type=%d\n", qp_type); 346 + 347 + pd = rdma_rm_get_pd(dev_res, pd_handle); 348 + if (!pd) { 349 + pr_err("Invalid pd handle (%d)\n", pd_handle); 350 + return -EINVAL; 351 + } 352 + 353 + scq = rdma_rm_get_cq(dev_res, send_cq_handle); 354 + rcq = rdma_rm_get_cq(dev_res, recv_cq_handle); 355 + 356 + if (!scq || !rcq) { 357 + pr_err("Invalid send_cqn or recv_cqn (%d, %d)\n", 358 + send_cq_handle, recv_cq_handle); 359 + return -EINVAL; 360 + } 361 + 362 + qp = res_tbl_alloc(&dev_res->qp_tbl, &rm_qpn); 363 + if (!qp) { 364 + return -ENOMEM; 365 + } 366 + pr_dbg("rm_qpn=%d\n", rm_qpn); 367 + 368 + qp->qpn = rm_qpn; 369 + qp->qp_state = IBV_QPS_RESET; 370 + qp->qp_type = qp_type; 371 + qp->send_cq_handle = send_cq_handle; 372 + qp->recv_cq_handle = recv_cq_handle; 373 + qp->opaque = opaque; 374 + 375 + rc = rdma_backend_create_qp(&qp->backend_qp, qp_type, &pd->backend_pd, 376 + &scq->backend_cq, &rcq->backend_cq, max_send_wr, 377 + max_recv_wr, max_send_sge, max_recv_sge); 378 + if (rc) { 379 + rc = -EIO; 380 + goto out_dealloc_qp; 381 + } 382 + 383 + *qpn = rdma_backend_qpn(&qp->backend_qp); 384 + pr_dbg("rm_qpn=%d, backend_qpn=0x%x\n", rm_qpn, *qpn); 385 + g_hash_table_insert(dev_res->qp_hash, g_bytes_new(qpn, sizeof(*qpn)), qp); 386 + 387 + return 0; 388 + 389 + out_dealloc_qp: 390 + res_tbl_dealloc(&dev_res->qp_tbl, qp->qpn); 391 + 392 + return 
rc; 393 + } 394 + 395 + int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, 396 + uint32_t qp_handle, uint32_t attr_mask, 397 + union ibv_gid *dgid, uint32_t dqpn, 398 + enum ibv_qp_state qp_state, uint32_t qkey, 399 + uint32_t rq_psn, uint32_t sq_psn) 400 + { 401 + RdmaRmQP *qp; 402 + int ret; 403 + 404 + pr_dbg("qpn=%d\n", qp_handle); 405 + 406 + qp = rdma_rm_get_qp(dev_res, qp_handle); 407 + if (!qp) { 408 + return -EINVAL; 409 + } 410 + 411 + pr_dbg("qp_type=%d\n", qp->qp_type); 412 + pr_dbg("attr_mask=0x%x\n", attr_mask); 413 + 414 + if (qp->qp_type == IBV_QPT_SMI) { 415 + pr_dbg("QP0 unsupported\n"); 416 + return -EPERM; 417 + } else if (qp->qp_type == IBV_QPT_GSI) { 418 + pr_dbg("QP1\n"); 419 + return 0; 420 + } 421 + 422 + if (attr_mask & IBV_QP_STATE) { 423 + qp->qp_state = qp_state; 424 + pr_dbg("qp_state=%d\n", qp->qp_state); 425 + 426 + if (qp->qp_state == IBV_QPS_INIT) { 427 + ret = rdma_backend_qp_state_init(backend_dev, &qp->backend_qp, 428 + qp->qp_type, qkey); 429 + if (ret) { 430 + return -EIO; 431 + } 432 + } 433 + 434 + if (qp->qp_state == IBV_QPS_RTR) { 435 + ret = rdma_backend_qp_state_rtr(backend_dev, &qp->backend_qp, 436 + qp->qp_type, dgid, dqpn, rq_psn, 437 + qkey, attr_mask & IBV_QP_QKEY); 438 + if (ret) { 439 + return -EIO; 440 + } 441 + } 442 + 443 + if (qp->qp_state == IBV_QPS_RTS) { 444 + ret = rdma_backend_qp_state_rts(&qp->backend_qp, qp->qp_type, 445 + sq_psn, qkey, 446 + attr_mask & IBV_QP_QKEY); 447 + if (ret) { 448 + return -EIO; 449 + } 450 + } 451 + } 452 + 453 + return 0; 454 + } 455 + 456 + void rdma_rm_dealloc_qp(RdmaDeviceResources *dev_res, uint32_t qp_handle) 457 + { 458 + RdmaRmQP *qp; 459 + GBytes *key; 460 + 461 + key = g_bytes_new(&qp_handle, sizeof(qp_handle)); 462 + qp = g_hash_table_lookup(dev_res->qp_hash, key); 463 + g_hash_table_remove(dev_res->qp_hash, key); 464 + g_bytes_unref(key); 465 + 466 + if (!qp) { 467 + return; 468 + } 469 + 470 + rdma_backend_destroy_qp(&qp->backend_qp); 471 + 472 + res_tbl_dealloc(&dev_res->qp_tbl, qp->qpn); 473 + } 474 + 475 + void *rdma_rm_get_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id) 476 + { 477 + void **cqe_ctx; 478 + 479 + cqe_ctx = res_tbl_get(&dev_res->cqe_ctx_tbl, cqe_ctx_id); 480 + if (!cqe_ctx) { 481 + return NULL; 482 + } 483 + 484 + pr_dbg("ctx=%p\n", *cqe_ctx); 485 + 486 + return *cqe_ctx; 487 + } 488 + 489 + int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t *cqe_ctx_id, 490 + void *ctx) 491 + { 492 + void **cqe_ctx; 493 + 494 + cqe_ctx = res_tbl_alloc(&dev_res->cqe_ctx_tbl, cqe_ctx_id); 495 + if (!cqe_ctx) { 496 + return -ENOMEM; 497 + } 498 + 499 + pr_dbg("ctx=%p\n", ctx); 500 + *cqe_ctx = ctx; 501 + 502 + return 0; 503 + } 504 + 505 + void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id) 506 + { 507 + res_tbl_dealloc(&dev_res->cqe_ctx_tbl, cqe_ctx_id); 508 + } 509 + 510 + static void destroy_qp_hash_key(gpointer data) 511 + { 512 + g_bytes_unref(data); 513 + } 514 + 515 + int rdma_rm_init(RdmaDeviceResources *dev_res, struct ibv_device_attr *dev_attr, 516 + Error **errp) 517 + { 518 + dev_res->qp_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal, 519 + destroy_qp_hash_key, NULL); 520 + if (!dev_res->qp_hash) { 521 + return -ENOMEM; 522 + } 523 + 524 + res_tbl_init("PD", &dev_res->pd_tbl, dev_attr->max_pd, sizeof(RdmaRmPD)); 525 + res_tbl_init("CQ", &dev_res->cq_tbl, dev_attr->max_cq, sizeof(RdmaRmCQ)); 526 + res_tbl_init("MR", &dev_res->mr_tbl, dev_attr->max_mr, sizeof(RdmaRmMR)); 527 + 
res_tbl_init("QP", &dev_res->qp_tbl, dev_attr->max_qp, sizeof(RdmaRmQP)); 528 + res_tbl_init("CQE_CTX", &dev_res->cqe_ctx_tbl, dev_attr->max_qp * 529 + dev_attr->max_qp_wr, sizeof(void *)); 530 + res_tbl_init("UC", &dev_res->uc_tbl, MAX_UCS, sizeof(RdmaRmUC)); 531 + 532 + return 0; 533 + } 534 + 535 + void rdma_rm_fini(RdmaDeviceResources *dev_res) 536 + { 537 + res_tbl_free(&dev_res->uc_tbl); 538 + res_tbl_free(&dev_res->cqe_ctx_tbl); 539 + res_tbl_free(&dev_res->qp_tbl); 540 + res_tbl_free(&dev_res->cq_tbl); 541 + res_tbl_free(&dev_res->mr_tbl); 542 + res_tbl_free(&dev_res->pd_tbl); 543 + g_hash_table_destroy(dev_res->qp_hash); 544 + }
+69
hw/rdma/rdma_rm.h
··· 1 + /* 2 + * RDMA device: Definitions of Resource Manager functions 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef RDMA_RM_H 17 + #define RDMA_RM_H 18 + 19 + #include <qapi/error.h> 20 + #include "rdma_backend_defs.h" 21 + #include "rdma_rm_defs.h" 22 + 23 + int rdma_rm_init(RdmaDeviceResources *dev_res, struct ibv_device_attr *dev_attr, 24 + Error **errp); 25 + void rdma_rm_fini(RdmaDeviceResources *dev_res); 26 + 27 + int rdma_rm_alloc_pd(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, 28 + uint32_t *pd_handle, uint32_t ctx_handle); 29 + RdmaRmPD *rdma_rm_get_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle); 30 + void rdma_rm_dealloc_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle); 31 + 32 + int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle, 33 + uint64_t guest_start, size_t guest_length, void *host_virt, 34 + int access_flags, uint32_t *mr_handle, uint32_t *lkey, 35 + uint32_t *rkey); 36 + RdmaRmMR *rdma_rm_get_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle); 37 + void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle); 38 + 39 + int rdma_rm_alloc_uc(RdmaDeviceResources *dev_res, uint32_t pfn, 40 + uint32_t *uc_handle); 41 + RdmaRmUC *rdma_rm_get_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle); 42 + void rdma_rm_dealloc_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle); 43 + 44 + int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, 45 + uint32_t cqe, uint32_t *cq_handle, void *opaque); 46 + RdmaRmCQ *rdma_rm_get_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle); 47 + void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle, 48 + bool notify); 49 + void rdma_rm_dealloc_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle); 50 + 51 + int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle, 52 + uint8_t qp_type, uint32_t max_send_wr, 53 + uint32_t max_send_sge, uint32_t send_cq_handle, 54 + uint32_t max_recv_wr, uint32_t max_recv_sge, 55 + uint32_t recv_cq_handle, void *opaque, uint32_t *qpn); 56 + RdmaRmQP *rdma_rm_get_qp(RdmaDeviceResources *dev_res, uint32_t qpn); 57 + int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, 58 + uint32_t qp_handle, uint32_t attr_mask, 59 + union ibv_gid *dgid, uint32_t dqpn, 60 + enum ibv_qp_state qp_state, uint32_t qkey, 61 + uint32_t rq_psn, uint32_t sq_psn); 62 + void rdma_rm_dealloc_qp(RdmaDeviceResources *dev_res, uint32_t qp_handle); 63 + 64 + int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t *cqe_ctx_id, 65 + void *ctx); 66 + void *rdma_rm_get_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id); 67 + void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id); 68 + 69 + #endif
+104
hw/rdma/rdma_rm_defs.h
··· 1 + /* 2 + * RDMA device: Definitions of Resource Manager structures 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef RDMA_RM_DEFS_H 17 + #define RDMA_RM_DEFS_H 18 + 19 + #include "rdma_backend_defs.h" 20 + 21 + #define MAX_PORTS 1 22 + #define MAX_PORT_GIDS 1 23 + #define MAX_PORT_PKEYS 1 24 + #define MAX_PKEYS 1 25 + #define MAX_GIDS 2048 26 + #define MAX_UCS 512 27 + #define MAX_MR_SIZE (1UL << 27) 28 + #define MAX_QP 1024 29 + #define MAX_SGE 4 30 + #define MAX_CQ 2048 31 + #define MAX_MR 1024 32 + #define MAX_PD 1024 33 + #define MAX_QP_RD_ATOM 16 34 + #define MAX_QP_INIT_RD_ATOM 16 35 + #define MAX_AH 64 36 + 37 + #define MAX_RMRESTBL_NAME_SZ 16 38 + typedef struct RdmaRmResTbl { 39 + char name[MAX_RMRESTBL_NAME_SZ]; 40 + QemuMutex lock; 41 + unsigned long *bitmap; 42 + size_t tbl_sz; 43 + size_t res_sz; 44 + void *tbl; 45 + } RdmaRmResTbl; 46 + 47 + typedef struct RdmaRmPD { 48 + RdmaBackendPD backend_pd; 49 + uint32_t ctx_handle; 50 + } RdmaRmPD; 51 + 52 + typedef struct RdmaRmCQ { 53 + RdmaBackendCQ backend_cq; 54 + void *opaque; 55 + bool notify; 56 + } RdmaRmCQ; 57 + 58 + typedef struct RdmaRmUserMR { 59 + uint64_t host_virt; 60 + uint64_t guest_start; 61 + size_t length; 62 + } RdmaRmUserMR; 63 + 64 + /* MR (DMA region) */ 65 + typedef struct RdmaRmMR { 66 + RdmaBackendMR backend_mr; 67 + RdmaRmUserMR user_mr; 68 + uint32_t pd_handle; 69 + uint32_t lkey; 70 + uint32_t rkey; 71 + } RdmaRmMR; 72 + 73 + typedef struct RdmaRmUC { 74 + uint64_t uc_handle; 75 + } RdmaRmUC; 76 + 77 + typedef struct RdmaRmQP { 78 + RdmaBackendQP backend_qp; 79 + void *opaque; 80 + uint32_t qp_type; 81 + uint32_t qpn; 82 + uint32_t send_cq_handle; 83 + uint32_t recv_cq_handle; 84 + enum ibv_qp_state qp_state; 85 + } RdmaRmQP; 86 + 87 + typedef struct RdmaRmPort { 88 + union ibv_gid gid_tbl[MAX_PORT_GIDS]; 89 + enum ibv_port_state state; 90 + int *pkey_tbl; /* TODO: Not yet supported */ 91 + } RdmaRmPort; 92 + 93 + typedef struct RdmaDeviceResources { 94 + RdmaRmPort ports[MAX_PORTS]; 95 + RdmaRmResTbl pd_tbl; 96 + RdmaRmResTbl mr_tbl; 97 + RdmaRmResTbl uc_tbl; 98 + RdmaRmResTbl qp_tbl; 99 + RdmaRmResTbl cq_tbl; 100 + RdmaRmResTbl cqe_ctx_tbl; 101 + GHashTable *qp_hash; /* Keeps mapping between real and emulated */ 102 + } RdmaDeviceResources; 103 + 104 + #endif
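The qp_hash table above is what lets the device find its RdmaRmQP from the backend QP number the hardware actually reports: the 32-bit qpn is wrapped in a GBytes key, exactly as rdma_rm_get_qp() and rdma_rm_alloc_qp() do earlier in rdma_rm.c. A self-contained GLib illustration of that keying scheme, with a plain int standing in for RdmaRmQP:

    /* Build: gcc demo.c $(pkg-config --cflags --libs glib-2.0) */
    #include <glib.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        GHashTable *h = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                              (GDestroyNotify)g_bytes_unref,
                                              NULL);
        uint32_t qpn = 0x1234;
        int qp_object = 42;                       /* stands in for RdmaRmQP */

        /* Insert: the table owns the 4-byte key. */
        g_hash_table_insert(h, g_bytes_new(&qpn, sizeof(qpn)), &qp_object);

        /* Lookup: build a temporary key with the same bytes, then drop it. */
        GBytes *key = g_bytes_new(&qpn, sizeof(qpn));
        int *found = g_hash_table_lookup(h, key);
        g_bytes_unref(key);

        printf("qpn 0x%x -> %d\n", qpn, found ? *found : -1);
        g_hash_table_destroy(h);
        return 0;
    }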
+51
hw/rdma/rdma_utils.c
··· 1 + /* 2 + * QEMU paravirtual RDMA - Generic RDMA backend 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include "rdma_utils.h" 17 + 18 + void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen) 19 + { 20 + void *p; 21 + hwaddr len = plen; 22 + 23 + if (!addr) { 24 + pr_dbg("addr is NULL\n"); 25 + return NULL; 26 + } 27 + 28 + p = pci_dma_map(dev, addr, &len, DMA_DIRECTION_TO_DEVICE); 29 + if (!p) { 30 + pr_dbg("Fail in pci_dma_map, addr=0x%llx, len=%ld\n", 31 + (long long unsigned int)addr, len); 32 + return NULL; 33 + } 34 + 35 + if (len != plen) { 36 + rdma_pci_dma_unmap(dev, p, len); 37 + return NULL; 38 + } 39 + 40 + pr_dbg("0x%llx -> %p (len=%ld)\n", (long long unsigned int)addr, p, len); 41 + 42 + return p; 43 + } 44 + 45 + void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len) 46 + { 47 + pr_dbg("%p\n", buffer); 48 + if (buffer) { 49 + pci_dma_unmap(dev, buffer, len, DMA_DIRECTION_TO_DEVICE, 0); 50 + } 51 + }
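rdma_pci_dma_map() above is a checked wrapper around pci_dma_map(): it refuses NULL addresses and bails out (unmapping again) if the full requested length could not be mapped contiguously. A hedged sketch of the typical call pattern used elsewhere in the device to read one guest page (QEMU-internal API, not standalone; pdev and guest_dma come from the caller):

    uint64_t *page = rdma_pci_dma_map(pdev, guest_dma, TARGET_PAGE_SIZE);
    if (!page) {
        return -ENOMEM;                /* partial or failed mapping returns NULL */
    }
    uint64_t first_entry = page[0];    /* e.g. first page-directory entry */
    rdma_pci_dma_unmap(pdev, page, TARGET_PAGE_SIZE);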
+43
hw/rdma/rdma_utils.h
··· 1 + /* 2 + * RDMA device: Debug utilities 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * 8 + * Authors: 9 + * Yuval Shaia <yuval.shaia@oracle.com> 10 + * Marcel Apfelbaum <marcel@redhat.com> 11 + * 12 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 13 + * See the COPYING file in the top-level directory. 14 + * 15 + */ 16 + 17 + #ifndef RDMA_UTILS_H 18 + #define RDMA_UTILS_H 19 + 20 + #include <qemu/osdep.h> 21 + #include <include/hw/pci/pci.h> 22 + #include <include/sysemu/dma.h> 23 + 24 + #define pr_info(fmt, ...) \ 25 + fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__,\ 26 + ## __VA_ARGS__) 27 + 28 + #define pr_err(fmt, ...) \ 29 + fprintf(stderr, "%s: Error at %-20s (%3d): " fmt, "pvrdma", __func__, \ 30 + __LINE__, ## __VA_ARGS__) 31 + 32 + #ifdef PVRDMA_DEBUG 33 + #define pr_dbg(fmt, ...) \ 34 + fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__,\ 35 + ## __VA_ARGS__) 36 + #else 37 + #define pr_dbg(fmt, ...) 38 + #endif 39 + 40 + void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen); 41 + void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len); 42 + 43 + #endif
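The pr_* macros above are plain fprintf wrappers with a fixed "pvrdma" prefix; pr_dbg expands to nothing unless the file is built with PVRDMA_DEBUG defined, so debug logging costs nothing in normal builds. A standalone demonstration of the same on/off pattern (compile once plainly and once with -DPVRDMA_DEBUG to compare):

    #include <stdio.h>

    #define pr_info(fmt, ...) \
        fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__, \
                ## __VA_ARGS__)

    #ifdef PVRDMA_DEBUG
    #define pr_dbg(fmt, ...) \
        fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__, \
                ## __VA_ARGS__)
    #else
    #define pr_dbg(fmt, ...)             /* compiled out in normal builds */
    #endif

    int main(void)
    {
        pr_info("always printed\n");
        pr_dbg("printed only when built with -DPVRDMA_DEBUG\n");
        return 0;
    }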
+5
hw/rdma/trace-events
··· 1 + # See docs/tracing.txt for syntax documentation. 2 + 3 + #hw/rdma/rdma_backend.c 4 + create_ah_cache_hit(uint64_t subnet, uint64_t net_id) "subnet = 0x%"PRIx64" net_id = 0x%"PRIx64 5 + create_ah_cache_miss(uint64_t subnet, uint64_t net_id) "subnet = 0x%"PRIx64" net_id = 0x%"PRIx64
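Each line here declares a trace point; QEMU's tracetool generates a matching trace_<name>() function from it, which the backend calls at the probe site. A hedged sketch of what a call site in rdma_backend.c would look like (the gid variable and the exact location are assumptions):

    #include "trace.h"   /* generated from hw/rdma/trace-events */

    /* On an address-handle cache lookup: */
    trace_create_ah_cache_hit(be64_to_cpu(gid->global.subnet_prefix),
                              be64_to_cpu(gid->global.interface_id));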
+122
hw/rdma/vmw/pvrdma.h
··· 1 + /* 2 + * QEMU VMWARE paravirtual RDMA device definitions 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef PVRDMA_PVRDMA_H 17 + #define PVRDMA_PVRDMA_H 18 + 19 + #include <hw/pci/pci.h> 20 + #include <hw/pci/msix.h> 21 + 22 + #include "../rdma_backend_defs.h" 23 + #include "../rdma_rm_defs.h" 24 + 25 + #include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h> 26 + #include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h> 27 + #include "pvrdma_dev_ring.h" 28 + 29 + /* BARs */ 30 + #define RDMA_MSIX_BAR_IDX 0 31 + #define RDMA_REG_BAR_IDX 1 32 + #define RDMA_UAR_BAR_IDX 2 33 + #define RDMA_BAR0_MSIX_SIZE (16 * 1024) 34 + #define RDMA_BAR1_REGS_SIZE 256 35 + #define RDMA_BAR2_UAR_SIZE (0x1000 * MAX_UCS) /* each uc gets page */ 36 + 37 + /* MSIX */ 38 + #define RDMA_MAX_INTRS 3 39 + #define RDMA_MSIX_TABLE 0x0000 40 + #define RDMA_MSIX_PBA 0x2000 41 + 42 + /* Interrupts Vectors */ 43 + #define INTR_VEC_CMD_RING 0 44 + #define INTR_VEC_CMD_ASYNC_EVENTS 1 45 + #define INTR_VEC_CMD_COMPLETION_Q 2 46 + 47 + /* HW attributes */ 48 + #define PVRDMA_HW_NAME "pvrdma" 49 + #define PVRDMA_HW_VERSION 17 50 + #define PVRDMA_FW_VERSION 14 51 + 52 + typedef struct DSRInfo { 53 + dma_addr_t dma; 54 + struct pvrdma_device_shared_region *dsr; 55 + 56 + union pvrdma_cmd_req *req; 57 + union pvrdma_cmd_resp *rsp; 58 + 59 + struct pvrdma_ring *async_ring_state; 60 + PvrdmaRing async; 61 + 62 + struct pvrdma_ring *cq_ring_state; 63 + PvrdmaRing cq; 64 + } DSRInfo; 65 + 66 + typedef struct PVRDMADev { 67 + PCIDevice parent_obj; 68 + MemoryRegion msix; 69 + MemoryRegion regs; 70 + uint32_t regs_data[RDMA_BAR1_REGS_SIZE]; 71 + MemoryRegion uar; 72 + uint32_t uar_data[RDMA_BAR2_UAR_SIZE]; 73 + DSRInfo dsr_info; 74 + int interrupt_mask; 75 + struct ibv_device_attr dev_attr; 76 + uint64_t node_guid; 77 + char *backend_device_name; 78 + uint8_t backend_gid_idx; 79 + uint8_t backend_port_num; 80 + RdmaBackendDev backend_dev; 81 + RdmaDeviceResources rdma_dev_res; 82 + } PVRDMADev; 83 + #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME) 84 + 85 + static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val) 86 + { 87 + int idx = addr >> 2; 88 + 89 + if (idx > RDMA_BAR1_REGS_SIZE) { 90 + return -EINVAL; 91 + } 92 + 93 + *val = dev->regs_data[idx]; 94 + 95 + return 0; 96 + } 97 + 98 + static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val) 99 + { 100 + int idx = addr >> 2; 101 + 102 + if (idx > RDMA_BAR1_REGS_SIZE) { 103 + return -EINVAL; 104 + } 105 + 106 + dev->regs_data[idx] = val; 107 + 108 + return 0; 109 + } 110 + 111 + static inline void post_interrupt(PVRDMADev *dev, unsigned vector) 112 + { 113 + PCIDevice *pci_dev = PCI_DEVICE(dev); 114 + 115 + if (likely(!dev->interrupt_mask)) { 116 + msix_notify(pci_dev, vector); 117 + } 118 + } 119 + 120 + int execute_command(PVRDMADev *dev); 121 + 122 + #endif
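Note the BAR layout above: BAR 2 is sized at one 4 KiB page per user context (0x1000 * MAX_UCS), so a UAR access splits into a context-page number in the upper bits and a doorbell offset in the low 12 bits; the uar_write() handler later masks with 0xFFF for exactly this reason. A standalone sketch of that split (the example address is arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_UCS             512
    #define RDMA_BAR2_UAR_SIZE  (0x1000 * MAX_UCS)   /* each uc gets a page */

    int main(void)
    {
        uint64_t addr = 0x3004;                   /* example UAR write offset */
        uint64_t uc_page = addr >> 12;            /* which user context's page */
        uint64_t doorbell = addr & 0xFFF;         /* QP/CQ doorbell within the page */

        printf("BAR2 size=0x%x, addr=0x%llx -> uc page %llu, doorbell 0x%llx\n",
               RDMA_BAR2_UAR_SIZE, (unsigned long long)addr,
               (unsigned long long)uc_page, (unsigned long long)doorbell);
        return 0;
    }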
+673
hw/rdma/vmw/pvrdma_cmd.c
··· 1 + /* 2 + * QEMU paravirtual RDMA - Command channel 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include <qemu/osdep.h> 17 + #include <qemu/error-report.h> 18 + #include <cpu.h> 19 + #include <linux/types.h> 20 + #include "hw/hw.h" 21 + #include "hw/pci/pci.h" 22 + #include "hw/pci/pci_ids.h" 23 + 24 + #include "../rdma_backend.h" 25 + #include "../rdma_rm.h" 26 + #include "../rdma_utils.h" 27 + 28 + #include "pvrdma.h" 29 + #include <standard-headers/rdma/vmw_pvrdma-abi.h> 30 + 31 + static void *pvrdma_map_to_pdir(PCIDevice *pdev, uint64_t pdir_dma, 32 + uint32_t nchunks, size_t length) 33 + { 34 + uint64_t *dir, *tbl; 35 + int tbl_idx, dir_idx, addr_idx; 36 + void *host_virt = NULL, *curr_page; 37 + 38 + if (!nchunks) { 39 + pr_dbg("nchunks=0\n"); 40 + return NULL; 41 + } 42 + 43 + dir = rdma_pci_dma_map(pdev, pdir_dma, TARGET_PAGE_SIZE); 44 + if (!dir) { 45 + error_report("PVRDMA: Failed to map to page directory"); 46 + return NULL; 47 + } 48 + 49 + tbl = rdma_pci_dma_map(pdev, dir[0], TARGET_PAGE_SIZE); 50 + if (!tbl) { 51 + error_report("PVRDMA: Failed to map to page table 0"); 52 + goto out_unmap_dir; 53 + } 54 + 55 + curr_page = rdma_pci_dma_map(pdev, (dma_addr_t)tbl[0], TARGET_PAGE_SIZE); 56 + if (!curr_page) { 57 + error_report("PVRDMA: Failed to map the first page"); 58 + goto out_unmap_tbl; 59 + } 60 + 61 + host_virt = mremap(curr_page, 0, length, MREMAP_MAYMOVE); 62 + if (host_virt == MAP_FAILED) { 63 + host_virt = NULL; 64 + error_report("PVRDMA: Failed to remap memory for host_virt"); 65 + goto out_unmap_tbl; 66 + } 67 + 68 + rdma_pci_dma_unmap(pdev, curr_page, TARGET_PAGE_SIZE); 69 + 70 + pr_dbg("host_virt=%p\n", host_virt); 71 + 72 + dir_idx = 0; 73 + tbl_idx = 1; 74 + addr_idx = 1; 75 + while (addr_idx < nchunks) { 76 + if ((tbl_idx == (TARGET_PAGE_SIZE / sizeof(uint64_t)))) { 77 + tbl_idx = 0; 78 + dir_idx++; 79 + pr_dbg("Mapping to table %d\n", dir_idx); 80 + rdma_pci_dma_unmap(pdev, tbl, TARGET_PAGE_SIZE); 81 + tbl = rdma_pci_dma_map(pdev, dir[dir_idx], TARGET_PAGE_SIZE); 82 + if (!tbl) { 83 + error_report("PVRDMA: Failed to map to page table %d", dir_idx); 84 + goto out_unmap_host_virt; 85 + } 86 + } 87 + 88 + pr_dbg("guest_dma[%d]=0x%lx\n", addr_idx, tbl[tbl_idx]); 89 + 90 + curr_page = rdma_pci_dma_map(pdev, (dma_addr_t)tbl[tbl_idx], 91 + TARGET_PAGE_SIZE); 92 + if (!curr_page) { 93 + error_report("PVRDMA: Failed to map to page %d, dir %d", tbl_idx, 94 + dir_idx); 95 + goto out_unmap_host_virt; 96 + } 97 + 98 + mremap(curr_page, 0, TARGET_PAGE_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, 99 + host_virt + TARGET_PAGE_SIZE * addr_idx); 100 + 101 + rdma_pci_dma_unmap(pdev, curr_page, TARGET_PAGE_SIZE); 102 + 103 + addr_idx++; 104 + 105 + tbl_idx++; 106 + } 107 + 108 + goto out_unmap_tbl; 109 + 110 + out_unmap_host_virt: 111 + munmap(host_virt, length); 112 + host_virt = NULL; 113 + 114 + out_unmap_tbl: 115 + rdma_pci_dma_unmap(pdev, tbl, TARGET_PAGE_SIZE); 116 + 117 + out_unmap_dir: 118 + rdma_pci_dma_unmap(pdev, dir, TARGET_PAGE_SIZE); 119 + 120 + return host_virt; 121 + } 122 + 123 + static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req, 124 + union pvrdma_cmd_resp *rsp) 125 + { 126 + struct pvrdma_cmd_query_port *cmd = &req->query_port; 127 + struct 
pvrdma_cmd_query_port_resp *resp = &rsp->query_port_resp; 128 + struct pvrdma_port_attr attrs = {0}; 129 + 130 + pr_dbg("port=%d\n", cmd->port_num); 131 + 132 + if (rdma_backend_query_port(&dev->backend_dev, 133 + (struct ibv_port_attr *)&attrs)) { 134 + return -ENOMEM; 135 + } 136 + 137 + memset(resp, 0, sizeof(*resp)); 138 + resp->hdr.response = cmd->hdr.response; 139 + resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP; 140 + resp->hdr.err = 0; 141 + 142 + resp->attrs.state = attrs.state; 143 + resp->attrs.max_mtu = attrs.max_mtu; 144 + resp->attrs.active_mtu = attrs.active_mtu; 145 + resp->attrs.phys_state = attrs.phys_state; 146 + resp->attrs.gid_tbl_len = MIN(MAX_PORT_GIDS, attrs.gid_tbl_len); 147 + resp->attrs.max_msg_sz = 1024; 148 + resp->attrs.pkey_tbl_len = MIN(MAX_PORT_PKEYS, attrs.pkey_tbl_len); 149 + resp->attrs.active_width = 1; 150 + resp->attrs.active_speed = 1; 151 + 152 + return 0; 153 + } 154 + 155 + static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req, 156 + union pvrdma_cmd_resp *rsp) 157 + { 158 + struct pvrdma_cmd_query_pkey *cmd = &req->query_pkey; 159 + struct pvrdma_cmd_query_pkey_resp *resp = &rsp->query_pkey_resp; 160 + 161 + pr_dbg("port=%d\n", cmd->port_num); 162 + pr_dbg("index=%d\n", cmd->index); 163 + 164 + memset(resp, 0, sizeof(*resp)); 165 + resp->hdr.response = cmd->hdr.response; 166 + resp->hdr.ack = PVRDMA_CMD_QUERY_PKEY_RESP; 167 + resp->hdr.err = 0; 168 + 169 + resp->pkey = 0x7FFF; 170 + pr_dbg("pkey=0x%x\n", resp->pkey); 171 + 172 + return 0; 173 + } 174 + 175 + static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req, 176 + union pvrdma_cmd_resp *rsp) 177 + { 178 + struct pvrdma_cmd_create_pd *cmd = &req->create_pd; 179 + struct pvrdma_cmd_create_pd_resp *resp = &rsp->create_pd_resp; 180 + 181 + pr_dbg("context=0x%x\n", cmd->ctx_handle ? 
cmd->ctx_handle : 0); 182 + 183 + memset(resp, 0, sizeof(*resp)); 184 + resp->hdr.response = cmd->hdr.response; 185 + resp->hdr.ack = PVRDMA_CMD_CREATE_PD_RESP; 186 + resp->hdr.err = rdma_rm_alloc_pd(&dev->rdma_dev_res, &dev->backend_dev, 187 + &resp->pd_handle, cmd->ctx_handle); 188 + 189 + pr_dbg("ret=%d\n", resp->hdr.err); 190 + return resp->hdr.err; 191 + } 192 + 193 + static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req *req, 194 + union pvrdma_cmd_resp *rsp) 195 + { 196 + struct pvrdma_cmd_destroy_pd *cmd = &req->destroy_pd; 197 + 198 + pr_dbg("pd_handle=%d\n", cmd->pd_handle); 199 + 200 + rdma_rm_dealloc_pd(&dev->rdma_dev_res, cmd->pd_handle); 201 + 202 + return 0; 203 + } 204 + 205 + static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req, 206 + union pvrdma_cmd_resp *rsp) 207 + { 208 + struct pvrdma_cmd_create_mr *cmd = &req->create_mr; 209 + struct pvrdma_cmd_create_mr_resp *resp = &rsp->create_mr_resp; 210 + PCIDevice *pci_dev = PCI_DEVICE(dev); 211 + void *host_virt = NULL; 212 + 213 + memset(resp, 0, sizeof(*resp)); 214 + resp->hdr.response = cmd->hdr.response; 215 + resp->hdr.ack = PVRDMA_CMD_CREATE_MR_RESP; 216 + 217 + pr_dbg("pd_handle=%d\n", cmd->pd_handle); 218 + pr_dbg("access_flags=0x%x\n", cmd->access_flags); 219 + pr_dbg("flags=0x%x\n", cmd->flags); 220 + 221 + if (!(cmd->flags & PVRDMA_MR_FLAG_DMA)) { 222 + host_virt = pvrdma_map_to_pdir(pci_dev, cmd->pdir_dma, cmd->nchunks, 223 + cmd->length); 224 + if (!host_virt) { 225 + pr_dbg("Failed to map to pdir\n"); 226 + resp->hdr.err = -EINVAL; 227 + goto out; 228 + } 229 + } 230 + 231 + resp->hdr.err = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle, 232 + cmd->start, cmd->length, host_virt, 233 + cmd->access_flags, &resp->mr_handle, 234 + &resp->lkey, &resp->rkey); 235 + if (!resp->hdr.err) { 236 + munmap(host_virt, cmd->length); 237 + } 238 + 239 + out: 240 + pr_dbg("ret=%d\n", resp->hdr.err); 241 + return resp->hdr.err; 242 + } 243 + 244 + static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req *req, 245 + union pvrdma_cmd_resp *rsp) 246 + { 247 + struct pvrdma_cmd_destroy_mr *cmd = &req->destroy_mr; 248 + 249 + pr_dbg("mr_handle=%d\n", cmd->mr_handle); 250 + 251 + rdma_rm_dealloc_mr(&dev->rdma_dev_res, cmd->mr_handle); 252 + 253 + return 0; 254 + } 255 + 256 + static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring, 257 + uint64_t pdir_dma, uint32_t nchunks, uint32_t cqe) 258 + { 259 + uint64_t *dir = NULL, *tbl = NULL; 260 + PvrdmaRing *r; 261 + int rc = -EINVAL; 262 + char ring_name[MAX_RING_NAME_SZ]; 263 + 264 + pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma); 265 + dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE); 266 + if (!dir) { 267 + pr_dbg("Failed to map to CQ page directory\n"); 268 + goto out; 269 + } 270 + 271 + tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE); 272 + if (!tbl) { 273 + pr_dbg("Failed to map to CQ page table\n"); 274 + goto out; 275 + } 276 + 277 + r = g_malloc(sizeof(*r)); 278 + *ring = r; 279 + 280 + r->ring_state = (struct pvrdma_ring *) 281 + rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE); 282 + 283 + if (!r->ring_state) { 284 + pr_dbg("Failed to map to CQ ring state\n"); 285 + goto out_free_ring; 286 + } 287 + 288 + sprintf(ring_name, "cq_ring_%lx", pdir_dma); 289 + rc = pvrdma_ring_init(r, ring_name, pci_dev, &r->ring_state[1], 290 + cqe, sizeof(struct pvrdma_cqe), 291 + /* first page is ring state */ 292 + (dma_addr_t *)&tbl[1], nchunks - 1); 293 + if (rc) { 294 + goto out_unmap_ring_state; 295 + } 296 + 297 + goto 
out; 298 + 299 + out_unmap_ring_state: 300 + /* ring_state was in slot 1, not 0 so need to jump back */ 301 + rdma_pci_dma_unmap(pci_dev, --r->ring_state, TARGET_PAGE_SIZE); 302 + 303 + out_free_ring: 304 + g_free(r); 305 + 306 + out: 307 + rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE); 308 + rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE); 309 + 310 + return rc; 311 + } 312 + 313 + static int create_cq(PVRDMADev *dev, union pvrdma_cmd_req *req, 314 + union pvrdma_cmd_resp *rsp) 315 + { 316 + struct pvrdma_cmd_create_cq *cmd = &req->create_cq; 317 + struct pvrdma_cmd_create_cq_resp *resp = &rsp->create_cq_resp; 318 + PvrdmaRing *ring = NULL; 319 + 320 + memset(resp, 0, sizeof(*resp)); 321 + resp->hdr.response = cmd->hdr.response; 322 + resp->hdr.ack = PVRDMA_CMD_CREATE_CQ_RESP; 323 + 324 + resp->cqe = cmd->cqe; 325 + 326 + resp->hdr.err = create_cq_ring(PCI_DEVICE(dev), &ring, cmd->pdir_dma, 327 + cmd->nchunks, cmd->cqe); 328 + if (resp->hdr.err) { 329 + goto out; 330 + } 331 + 332 + pr_dbg("ring=%p\n", ring); 333 + 334 + resp->hdr.err = rdma_rm_alloc_cq(&dev->rdma_dev_res, &dev->backend_dev, 335 + cmd->cqe, &resp->cq_handle, ring); 336 + resp->cqe = cmd->cqe; 337 + 338 + out: 339 + pr_dbg("ret=%d\n", resp->hdr.err); 340 + return resp->hdr.err; 341 + } 342 + 343 + static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req *req, 344 + union pvrdma_cmd_resp *rsp) 345 + { 346 + struct pvrdma_cmd_destroy_cq *cmd = &req->destroy_cq; 347 + RdmaRmCQ *cq; 348 + PvrdmaRing *ring; 349 + 350 + pr_dbg("cq_handle=%d\n", cmd->cq_handle); 351 + 352 + cq = rdma_rm_get_cq(&dev->rdma_dev_res, cmd->cq_handle); 353 + if (!cq) { 354 + pr_dbg("Invalid CQ handle\n"); 355 + return -EINVAL; 356 + } 357 + 358 + ring = (PvrdmaRing *)cq->opaque; 359 + pvrdma_ring_free(ring); 360 + /* ring_state was in slot 1, not 0 so need to jump back */ 361 + rdma_pci_dma_unmap(PCI_DEVICE(dev), --ring->ring_state, TARGET_PAGE_SIZE); 362 + g_free(ring); 363 + 364 + rdma_rm_dealloc_cq(&dev->rdma_dev_res, cmd->cq_handle); 365 + 366 + return 0; 367 + } 368 + 369 + static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma, 370 + PvrdmaRing **rings, uint32_t scqe, uint32_t smax_sge, 371 + uint32_t spages, uint32_t rcqe, uint32_t rmax_sge, 372 + uint32_t rpages) 373 + { 374 + uint64_t *dir = NULL, *tbl = NULL; 375 + PvrdmaRing *sr, *rr; 376 + int rc = -EINVAL; 377 + char ring_name[MAX_RING_NAME_SZ]; 378 + uint32_t wqe_sz; 379 + 380 + pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma); 381 + dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE); 382 + if (!dir) { 383 + pr_dbg("Failed to map to CQ page directory\n"); 384 + goto out; 385 + } 386 + 387 + tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE); 388 + if (!tbl) { 389 + pr_dbg("Failed to map to CQ page table\n"); 390 + goto out; 391 + } 392 + 393 + sr = g_malloc(2 * sizeof(*rr)); 394 + rr = &sr[1]; 395 + pr_dbg("sring=%p\n", sr); 396 + pr_dbg("rring=%p\n", rr); 397 + 398 + *rings = sr; 399 + 400 + pr_dbg("scqe=%d\n", scqe); 401 + pr_dbg("smax_sge=%d\n", smax_sge); 402 + pr_dbg("spages=%d\n", spages); 403 + pr_dbg("rcqe=%d\n", rcqe); 404 + pr_dbg("rmax_sge=%d\n", rmax_sge); 405 + pr_dbg("rpages=%d\n", rpages); 406 + 407 + /* Create send ring */ 408 + sr->ring_state = (struct pvrdma_ring *) 409 + rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE); 410 + if (!sr->ring_state) { 411 + pr_dbg("Failed to map to CQ ring state\n"); 412 + goto out_free_sr_mem; 413 + } 414 + 415 + wqe_sz = pow2ceil(sizeof(struct pvrdma_sq_wqe_hdr) + 416 + sizeof(struct 
pvrdma_sge) * smax_sge - 1); 417 + 418 + sprintf(ring_name, "qp_sring_%lx", pdir_dma); 419 + rc = pvrdma_ring_init(sr, ring_name, pci_dev, sr->ring_state, 420 + scqe, wqe_sz, (dma_addr_t *)&tbl[1], spages); 421 + if (rc) { 422 + goto out_unmap_ring_state; 423 + } 424 + 425 + /* Create recv ring */ 426 + rr->ring_state = &sr->ring_state[1]; 427 + wqe_sz = pow2ceil(sizeof(struct pvrdma_rq_wqe_hdr) + 428 + sizeof(struct pvrdma_sge) * rmax_sge - 1); 429 + sprintf(ring_name, "qp_rring_%lx", pdir_dma); 430 + rc = pvrdma_ring_init(rr, ring_name, pci_dev, rr->ring_state, 431 + rcqe, wqe_sz, (dma_addr_t *)&tbl[1 + spages], rpages); 432 + if (rc) { 433 + goto out_free_sr; 434 + } 435 + 436 + goto out; 437 + 438 + out_free_sr: 439 + pvrdma_ring_free(sr); 440 + 441 + out_unmap_ring_state: 442 + rdma_pci_dma_unmap(pci_dev, sr->ring_state, TARGET_PAGE_SIZE); 443 + 444 + out_free_sr_mem: 445 + g_free(sr); 446 + 447 + out: 448 + rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE); 449 + rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE); 450 + 451 + return rc; 452 + } 453 + 454 + static int create_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, 455 + union pvrdma_cmd_resp *rsp) 456 + { 457 + struct pvrdma_cmd_create_qp *cmd = &req->create_qp; 458 + struct pvrdma_cmd_create_qp_resp *resp = &rsp->create_qp_resp; 459 + PvrdmaRing *rings = NULL; 460 + 461 + memset(resp, 0, sizeof(*resp)); 462 + resp->hdr.response = cmd->hdr.response; 463 + resp->hdr.ack = PVRDMA_CMD_CREATE_QP_RESP; 464 + 465 + pr_dbg("total_chunks=%d\n", cmd->total_chunks); 466 + pr_dbg("send_chunks=%d\n", cmd->send_chunks); 467 + 468 + resp->hdr.err = create_qp_rings(PCI_DEVICE(dev), cmd->pdir_dma, &rings, 469 + cmd->max_send_wr, cmd->max_send_sge, 470 + cmd->send_chunks, cmd->max_recv_wr, 471 + cmd->max_recv_sge, cmd->total_chunks - 472 + cmd->send_chunks - 1); 473 + if (resp->hdr.err) { 474 + goto out; 475 + } 476 + 477 + pr_dbg("rings=%p\n", rings); 478 + 479 + resp->hdr.err = rdma_rm_alloc_qp(&dev->rdma_dev_res, cmd->pd_handle, 480 + cmd->qp_type, cmd->max_send_wr, 481 + cmd->max_send_sge, cmd->send_cq_handle, 482 + cmd->max_recv_wr, cmd->max_recv_sge, 483 + cmd->recv_cq_handle, rings, &resp->qpn); 484 + 485 + resp->max_send_wr = cmd->max_send_wr; 486 + resp->max_recv_wr = cmd->max_recv_wr; 487 + resp->max_send_sge = cmd->max_send_sge; 488 + resp->max_recv_sge = cmd->max_recv_sge; 489 + resp->max_inline_data = cmd->max_inline_data; 490 + 491 + out: 492 + pr_dbg("ret=%d\n", resp->hdr.err); 493 + return resp->hdr.err; 494 + } 495 + 496 + static int modify_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, 497 + union pvrdma_cmd_resp *rsp) 498 + { 499 + struct pvrdma_cmd_modify_qp *cmd = &req->modify_qp; 500 + 501 + pr_dbg("qp_handle=%d\n", cmd->qp_handle); 502 + 503 + memset(rsp, 0, sizeof(*rsp)); 504 + rsp->hdr.response = cmd->hdr.response; 505 + rsp->hdr.ack = PVRDMA_CMD_MODIFY_QP_RESP; 506 + 507 + rsp->hdr.err = rdma_rm_modify_qp(&dev->rdma_dev_res, &dev->backend_dev, 508 + cmd->qp_handle, cmd->attr_mask, 509 + (union ibv_gid *)&cmd->attrs.ah_attr.grh.dgid, 510 + cmd->attrs.dest_qp_num, cmd->attrs.qp_state, 511 + cmd->attrs.qkey, cmd->attrs.rq_psn, 512 + cmd->attrs.sq_psn); 513 + 514 + pr_dbg("ret=%d\n", rsp->hdr.err); 515 + return rsp->hdr.err; 516 + } 517 + 518 + static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, 519 + union pvrdma_cmd_resp *rsp) 520 + { 521 + struct pvrdma_cmd_destroy_qp *cmd = &req->destroy_qp; 522 + RdmaRmQP *qp; 523 + PvrdmaRing *ring; 524 + 525 + qp = rdma_rm_get_qp(&dev->rdma_dev_res, cmd->qp_handle); 526 
+ if (!qp) { 527 + pr_dbg("Invalid QP handle\n"); 528 + return -EINVAL; 529 + } 530 + 531 + rdma_rm_dealloc_qp(&dev->rdma_dev_res, cmd->qp_handle); 532 + 533 + ring = (PvrdmaRing *)qp->opaque; 534 + pr_dbg("sring=%p\n", &ring[0]); 535 + pvrdma_ring_free(&ring[0]); 536 + pr_dbg("rring=%p\n", &ring[1]); 537 + pvrdma_ring_free(&ring[1]); 538 + 539 + rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE); 540 + g_free(ring); 541 + 542 + return 0; 543 + } 544 + 545 + static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, 546 + union pvrdma_cmd_resp *rsp) 547 + { 548 + struct pvrdma_cmd_create_bind *cmd = &req->create_bind; 549 + #ifdef PVRDMA_DEBUG 550 + __be64 *subnet = (__be64 *)&cmd->new_gid[0]; 551 + __be64 *if_id = (__be64 *)&cmd->new_gid[8]; 552 + #endif 553 + 554 + pr_dbg("index=%d\n", cmd->index); 555 + 556 + if (cmd->index > MAX_PORT_GIDS) { 557 + return -EINVAL; 558 + } 559 + 560 + pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index, 561 + (long long unsigned int)be64_to_cpu(*subnet), 562 + (long long unsigned int)be64_to_cpu(*if_id)); 563 + 564 + /* Driver forces to one port only */ 565 + memcpy(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, &cmd->new_gid, 566 + sizeof(cmd->new_gid)); 567 + 568 + /* TODO: Since drivers stores node_guid at load_dsr phase then this 569 + * assignment is not relevant, i need to figure out a way how to 570 + * retrieve MAC of our netdev */ 571 + dev->node_guid = dev->rdma_dev_res.ports[0].gid_tbl[0].global.interface_id; 572 + pr_dbg("dev->node_guid=0x%llx\n", 573 + (long long unsigned int)be64_to_cpu(dev->node_guid)); 574 + 575 + return 0; 576 + } 577 + 578 + static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, 579 + union pvrdma_cmd_resp *rsp) 580 + { 581 + struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind; 582 + 583 + pr_dbg("clear index %d\n", cmd->index); 584 + 585 + memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0, 586 + sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw)); 587 + 588 + return 0; 589 + } 590 + 591 + static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req, 592 + union pvrdma_cmd_resp *rsp) 593 + { 594 + struct pvrdma_cmd_create_uc *cmd = &req->create_uc; 595 + struct pvrdma_cmd_create_uc_resp *resp = &rsp->create_uc_resp; 596 + 597 + pr_dbg("pfn=%d\n", cmd->pfn); 598 + 599 + memset(resp, 0, sizeof(*resp)); 600 + resp->hdr.response = cmd->hdr.response; 601 + resp->hdr.ack = PVRDMA_CMD_CREATE_UC_RESP; 602 + resp->hdr.err = rdma_rm_alloc_uc(&dev->rdma_dev_res, cmd->pfn, 603 + &resp->ctx_handle); 604 + 605 + pr_dbg("ret=%d\n", resp->hdr.err); 606 + 607 + return 0; 608 + } 609 + 610 + static int destroy_uc(PVRDMADev *dev, union pvrdma_cmd_req *req, 611 + union pvrdma_cmd_resp *rsp) 612 + { 613 + struct pvrdma_cmd_destroy_uc *cmd = &req->destroy_uc; 614 + 615 + pr_dbg("ctx_handle=%d\n", cmd->ctx_handle); 616 + 617 + rdma_rm_dealloc_uc(&dev->rdma_dev_res, cmd->ctx_handle); 618 + 619 + return 0; 620 + } 621 + struct cmd_handler { 622 + uint32_t cmd; 623 + int (*exec)(PVRDMADev *dev, union pvrdma_cmd_req *req, 624 + union pvrdma_cmd_resp *rsp); 625 + }; 626 + 627 + static struct cmd_handler cmd_handlers[] = { 628 + {PVRDMA_CMD_QUERY_PORT, query_port}, 629 + {PVRDMA_CMD_QUERY_PKEY, query_pkey}, 630 + {PVRDMA_CMD_CREATE_PD, create_pd}, 631 + {PVRDMA_CMD_DESTROY_PD, destroy_pd}, 632 + {PVRDMA_CMD_CREATE_MR, create_mr}, 633 + {PVRDMA_CMD_DESTROY_MR, destroy_mr}, 634 + {PVRDMA_CMD_CREATE_CQ, create_cq}, 635 + {PVRDMA_CMD_RESIZE_CQ, NULL}, 636 + {PVRDMA_CMD_DESTROY_CQ, 
destroy_cq}, 637 + {PVRDMA_CMD_CREATE_QP, create_qp}, 638 + {PVRDMA_CMD_MODIFY_QP, modify_qp}, 639 + {PVRDMA_CMD_QUERY_QP, NULL}, 640 + {PVRDMA_CMD_DESTROY_QP, destroy_qp}, 641 + {PVRDMA_CMD_CREATE_UC, create_uc}, 642 + {PVRDMA_CMD_DESTROY_UC, destroy_uc}, 643 + {PVRDMA_CMD_CREATE_BIND, create_bind}, 644 + {PVRDMA_CMD_DESTROY_BIND, destroy_bind}, 645 + }; 646 + 647 + int execute_command(PVRDMADev *dev) 648 + { 649 + int err = 0xFFFF; 650 + DSRInfo *dsr_info; 651 + 652 + dsr_info = &dev->dsr_info; 653 + 654 + pr_dbg("cmd=%d\n", dsr_info->req->hdr.cmd); 655 + if (dsr_info->req->hdr.cmd >= sizeof(cmd_handlers) / 656 + sizeof(struct cmd_handler)) { 657 + pr_dbg("Unsupported command\n"); 658 + goto out; 659 + } 660 + 661 + if (!cmd_handlers[dsr_info->req->hdr.cmd].exec) { 662 + pr_dbg("Unsupported command (not implemented yet)\n"); 663 + goto out; 664 + } 665 + 666 + err = cmd_handlers[dsr_info->req->hdr.cmd].exec(dev, dsr_info->req, 667 + dsr_info->rsp); 668 + out: 669 + set_reg_val(dev, PVRDMA_REG_ERR, err); 670 + post_interrupt(dev, INTR_VEC_CMD_RING); 671 + 672 + return (err == 0) ? 0 : -EINVAL; 673 + }
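execute_command() above dispatches on the guest's request header through a fixed table indexed by command number, treating out-of-range values and NULL handlers (commands accepted by the ABI but not implemented yet) the same way. A standalone sketch of the same table-driven dispatch with hypothetical command IDs:

    #include <stdio.h>

    typedef int (*cmd_fn)(void *ctx);

    static int do_create(void *ctx)  { (void)ctx; puts("create");  return 0; }
    static int do_destroy(void *ctx) { (void)ctx; puts("destroy"); return 0; }

    /* Index == command number, as in cmd_handlers[]. */
    static struct { unsigned cmd; cmd_fn exec; } handlers[] = {
        {0, do_create},
        {1, NULL},            /* known command, not implemented yet */
        {2, do_destroy},
    };

    static int dispatch(unsigned cmd, void *ctx)
    {
        if (cmd >= sizeof(handlers) / sizeof(handlers[0])) {
            return -1;        /* unsupported command */
        }
        if (!handlers[cmd].exec) {
            return -1;        /* recognised but unimplemented */
        }
        return handlers[cmd].exec(ctx);
    }

    int main(void)
    {
        printf("cmd 2 -> %d\n", dispatch(2, NULL));
        printf("cmd 1 -> %d\n", dispatch(1, NULL));
        return 0;
    }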
+155
hw/rdma/vmw/pvrdma_dev_ring.c
··· 1 + /* 2 + * QEMU paravirtual RDMA - Device rings 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include <qemu/osdep.h> 17 + #include <hw/pci/pci.h> 18 + #include <cpu.h> 19 + 20 + #include "../rdma_utils.h" 21 + #include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h> 22 + #include "pvrdma_dev_ring.h" 23 + 24 + int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev, 25 + struct pvrdma_ring *ring_state, uint32_t max_elems, 26 + size_t elem_sz, dma_addr_t *tbl, dma_addr_t npages) 27 + { 28 + int i; 29 + int rc = 0; 30 + 31 + strncpy(ring->name, name, MAX_RING_NAME_SZ); 32 + ring->name[MAX_RING_NAME_SZ - 1] = 0; 33 + pr_dbg("Initializing %s ring\n", ring->name); 34 + ring->dev = dev; 35 + ring->ring_state = ring_state; 36 + ring->max_elems = max_elems; 37 + ring->elem_sz = elem_sz; 38 + pr_dbg("ring->elem_sz=%ld\n", ring->elem_sz); 39 + pr_dbg("npages=%ld\n", npages); 40 + /* TODO: Give a moment to think if we want to redo driver settings 41 + atomic_set(&ring->ring_state->prod_tail, 0); 42 + atomic_set(&ring->ring_state->cons_head, 0); 43 + */ 44 + ring->npages = npages; 45 + ring->pages = g_malloc(npages * sizeof(void *)); 46 + 47 + for (i = 0; i < npages; i++) { 48 + if (!tbl[i]) { 49 + pr_err("npages=%ld but tbl[%d] is NULL\n", (long)npages, i); 50 + continue; 51 + } 52 + 53 + ring->pages[i] = rdma_pci_dma_map(dev, tbl[i], TARGET_PAGE_SIZE); 54 + if (!ring->pages[i]) { 55 + rc = -ENOMEM; 56 + pr_dbg("Failed to map to page %d\n", i); 57 + goto out_free; 58 + } 59 + memset(ring->pages[i], 0, TARGET_PAGE_SIZE); 60 + } 61 + 62 + goto out; 63 + 64 + out_free: 65 + while (i--) { 66 + rdma_pci_dma_unmap(dev, ring->pages[i], TARGET_PAGE_SIZE); 67 + } 68 + g_free(ring->pages); 69 + 70 + out: 71 + return rc; 72 + } 73 + 74 + void *pvrdma_ring_next_elem_read(PvrdmaRing *ring) 75 + { 76 + unsigned int idx = 0, offset; 77 + 78 + /* 79 + pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail, 80 + ring->ring_state->cons_head); 81 + */ 82 + 83 + if (!pvrdma_idx_ring_has_data(ring->ring_state, ring->max_elems, &idx)) { 84 + pr_dbg("No more data in ring\n"); 85 + return NULL; 86 + } 87 + 88 + offset = idx * ring->elem_sz; 89 + /* 90 + pr_dbg("idx=%d\n", idx); 91 + pr_dbg("offset=%d\n", offset); 92 + */ 93 + return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE); 94 + } 95 + 96 + void pvrdma_ring_read_inc(PvrdmaRing *ring) 97 + { 98 + pvrdma_idx_ring_inc(&ring->ring_state->cons_head, ring->max_elems); 99 + /* 100 + pr_dbg("%s: t=%d, h=%d, m=%ld\n", ring->name, 101 + ring->ring_state->prod_tail, ring->ring_state->cons_head, 102 + ring->max_elems); 103 + */ 104 + } 105 + 106 + void *pvrdma_ring_next_elem_write(PvrdmaRing *ring) 107 + { 108 + unsigned int idx, offset, tail; 109 + 110 + /* 111 + pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail, 112 + ring->ring_state->cons_head); 113 + */ 114 + 115 + if (!pvrdma_idx_ring_has_space(ring->ring_state, ring->max_elems, &tail)) { 116 + pr_dbg("CQ is full\n"); 117 + return NULL; 118 + } 119 + 120 + idx = pvrdma_idx(&ring->ring_state->prod_tail, ring->max_elems); 121 + /* TODO: tail == idx */ 122 + 123 + offset = idx * ring->elem_sz; 124 + return ring->pages[offset / 
TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE); 125 + } 126 + 127 + void pvrdma_ring_write_inc(PvrdmaRing *ring) 128 + { 129 + pvrdma_idx_ring_inc(&ring->ring_state->prod_tail, ring->max_elems); 130 + /* 131 + pr_dbg("%s: t=%d, h=%d, m=%ld\n", ring->name, 132 + ring->ring_state->prod_tail, ring->ring_state->cons_head, 133 + ring->max_elems); 134 + */ 135 + } 136 + 137 + void pvrdma_ring_free(PvrdmaRing *ring) 138 + { 139 + if (!ring) { 140 + return; 141 + } 142 + 143 + if (!ring->pages) { 144 + return; 145 + } 146 + 147 + pr_dbg("ring->npages=%d\n", ring->npages); 148 + while (ring->npages--) { 149 + rdma_pci_dma_unmap(ring->dev, ring->pages[ring->npages], 150 + TARGET_PAGE_SIZE); 151 + } 152 + 153 + g_free(ring->pages); 154 + ring->pages = NULL; 155 + }
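The element accessors above never copy data: the ring's backing store is an array of mapped guest pages, and an element index is converted to a byte offset, then split into a page index and an offset within that page. A standalone sketch of that arithmetic (element size and index are illustrative, standing in for ring->elem_sz and the index taken from the ring state):

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SIZE 4096

    int main(void)
    {
        size_t elem_sz = 128;                 /* stand-in for ring->elem_sz */
        unsigned idx = 70;                    /* stand-in for the ring-state index */

        size_t offset = idx * elem_sz;        /* byte offset into the ring */
        size_t page = offset / PAGE_SIZE;     /* which mapped guest page */
        size_t in_page = offset % PAGE_SIZE;  /* offset within that page */

        printf("element %u -> page %zu, offset 0x%zx\n", idx, page, in_page);
        return 0;
    }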
+42
hw/rdma/vmw/pvrdma_dev_ring.h
··· 1 + /* 2 + * QEMU VMWARE paravirtual RDMA ring utilities 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef PVRDMA_DEV_RING_H 17 + #define PVRDMA_DEV_RING_H 18 + 19 + #include <qemu/typedefs.h> 20 + 21 + #define MAX_RING_NAME_SZ 32 22 + 23 + typedef struct PvrdmaRing { 24 + char name[MAX_RING_NAME_SZ]; 25 + PCIDevice *dev; 26 + uint32_t max_elems; 27 + size_t elem_sz; 28 + struct pvrdma_ring *ring_state; /* used only for unmap */ 29 + int npages; 30 + void **pages; 31 + } PvrdmaRing; 32 + 33 + int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev, 34 + struct pvrdma_ring *ring_state, uint32_t max_elems, 35 + size_t elem_sz, dma_addr_t *tbl, dma_addr_t npages); 36 + void *pvrdma_ring_next_elem_read(PvrdmaRing *ring); 37 + void pvrdma_ring_read_inc(PvrdmaRing *ring); 38 + void *pvrdma_ring_next_elem_write(PvrdmaRing *ring); 39 + void pvrdma_ring_write_inc(PvrdmaRing *ring); 40 + void pvrdma_ring_free(PvrdmaRing *ring); 41 + 42 + #endif
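This header exposes the two halves of the producer/consumer protocol: the device drains guest work requests with next_elem_read()/read_inc() and publishes completions with next_elem_write()/write_inc(). A hedged sketch of a drain loop in the style pvrdma_qp_send() uses (QEMU-internal types, not standalone; ring is an already-initialized PvrdmaRing):

    void *elem;

    while ((elem = pvrdma_ring_next_elem_read(ring))) {
        /* ... translate the WQE found at 'elem' and hand it to the backend ... */
        pvrdma_ring_read_inc(ring);   /* advance cons_head past this element */
    }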
+670
hw/rdma/vmw/pvrdma_main.c
··· 1 + /* 2 + * QEMU paravirtual RDMA 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include <qemu/osdep.h> 17 + #include <qapi/error.h> 18 + #include <hw/hw.h> 19 + #include <hw/pci/pci.h> 20 + #include <hw/pci/pci_ids.h> 21 + #include <hw/pci/msi.h> 22 + #include <hw/pci/msix.h> 23 + #include <hw/qdev-core.h> 24 + #include <hw/qdev-properties.h> 25 + #include <cpu.h> 26 + #include "trace.h" 27 + 28 + #include "../rdma_rm.h" 29 + #include "../rdma_backend.h" 30 + #include "../rdma_utils.h" 31 + 32 + #include <infiniband/verbs.h> 33 + #include "pvrdma.h" 34 + #include <standard-headers/rdma/vmw_pvrdma-abi.h> 35 + #include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h> 36 + #include "pvrdma_qp_ops.h" 37 + 38 + static Property pvrdma_dev_properties[] = { 39 + DEFINE_PROP_STRING("backend-dev", PVRDMADev, backend_device_name), 40 + DEFINE_PROP_UINT8("backend-port", PVRDMADev, backend_port_num, 1), 41 + DEFINE_PROP_UINT8("backend-gid-idx", PVRDMADev, backend_gid_idx, 0), 42 + DEFINE_PROP_UINT64("dev-caps-max-mr-size", PVRDMADev, dev_attr.max_mr_size, 43 + MAX_MR_SIZE), 44 + DEFINE_PROP_INT32("dev-caps-max-qp", PVRDMADev, dev_attr.max_qp, MAX_QP), 45 + DEFINE_PROP_INT32("dev-caps-max-sge", PVRDMADev, dev_attr.max_sge, MAX_SGE), 46 + DEFINE_PROP_INT32("dev-caps-max-cq", PVRDMADev, dev_attr.max_cq, MAX_CQ), 47 + DEFINE_PROP_INT32("dev-caps-max-mr", PVRDMADev, dev_attr.max_mr, MAX_MR), 48 + DEFINE_PROP_INT32("dev-caps-max-pd", PVRDMADev, dev_attr.max_pd, MAX_PD), 49 + DEFINE_PROP_INT32("dev-caps-qp-rd-atom", PVRDMADev, dev_attr.max_qp_rd_atom, 50 + MAX_QP_RD_ATOM), 51 + DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev, 52 + dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM), 53 + DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH), 54 + DEFINE_PROP_END_OF_LIST(), 55 + }; 56 + 57 + static void free_dev_ring(PCIDevice *pci_dev, PvrdmaRing *ring, 58 + void *ring_state) 59 + { 60 + pvrdma_ring_free(ring); 61 + rdma_pci_dma_unmap(pci_dev, ring_state, TARGET_PAGE_SIZE); 62 + } 63 + 64 + static int init_dev_ring(PvrdmaRing *ring, struct pvrdma_ring **ring_state, 65 + const char *name, PCIDevice *pci_dev, 66 + dma_addr_t dir_addr, uint32_t num_pages) 67 + { 68 + uint64_t *dir, *tbl; 69 + int rc = 0; 70 + 71 + pr_dbg("Initializing device ring %s\n", name); 72 + pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)dir_addr); 73 + pr_dbg("num_pages=%d\n", num_pages); 74 + dir = rdma_pci_dma_map(pci_dev, dir_addr, TARGET_PAGE_SIZE); 75 + if (!dir) { 76 + pr_err("Failed to map to page directory\n"); 77 + rc = -ENOMEM; 78 + goto out; 79 + } 80 + tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE); 81 + if (!tbl) { 82 + pr_err("Failed to map to page table\n"); 83 + rc = -ENOMEM; 84 + goto out_free_dir; 85 + } 86 + 87 + *ring_state = rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE); 88 + if (!*ring_state) { 89 + pr_err("Failed to map to ring state\n"); 90 + rc = -ENOMEM; 91 + goto out_free_tbl; 92 + } 93 + /* RX ring is the second */ 94 + (struct pvrdma_ring *)(*ring_state)++; 95 + rc = pvrdma_ring_init(ring, name, pci_dev, 96 + (struct pvrdma_ring *)*ring_state, 97 + (num_pages - 1) * TARGET_PAGE_SIZE / 98 + sizeof(struct pvrdma_cqne), 99 + 
sizeof(struct pvrdma_cqne), 100 + (dma_addr_t *)&tbl[1], (dma_addr_t)num_pages - 1); 101 + if (rc) { 102 + pr_err("Failed to initialize ring\n"); 103 + rc = -ENOMEM; 104 + goto out_free_ring_state; 105 + } 106 + 107 + goto out_free_tbl; 108 + 109 + out_free_ring_state: 110 + rdma_pci_dma_unmap(pci_dev, *ring_state, TARGET_PAGE_SIZE); 111 + 112 + out_free_tbl: 113 + rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE); 114 + 115 + out_free_dir: 116 + rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE); 117 + 118 + out: 119 + return rc; 120 + } 121 + 122 + static void free_dsr(PVRDMADev *dev) 123 + { 124 + PCIDevice *pci_dev = PCI_DEVICE(dev); 125 + 126 + if (!dev->dsr_info.dsr) { 127 + return; 128 + } 129 + 130 + free_dev_ring(pci_dev, &dev->dsr_info.async, 131 + dev->dsr_info.async_ring_state); 132 + 133 + free_dev_ring(pci_dev, &dev->dsr_info.cq, dev->dsr_info.cq_ring_state); 134 + 135 + rdma_pci_dma_unmap(pci_dev, dev->dsr_info.req, 136 + sizeof(union pvrdma_cmd_req)); 137 + 138 + rdma_pci_dma_unmap(pci_dev, dev->dsr_info.rsp, 139 + sizeof(union pvrdma_cmd_resp)); 140 + 141 + rdma_pci_dma_unmap(pci_dev, dev->dsr_info.dsr, 142 + sizeof(struct pvrdma_device_shared_region)); 143 + 144 + dev->dsr_info.dsr = NULL; 145 + } 146 + 147 + static int load_dsr(PVRDMADev *dev) 148 + { 149 + int rc = 0; 150 + PCIDevice *pci_dev = PCI_DEVICE(dev); 151 + DSRInfo *dsr_info; 152 + struct pvrdma_device_shared_region *dsr; 153 + 154 + free_dsr(dev); 155 + 156 + /* Map to DSR */ 157 + pr_dbg("dsr_dma=0x%llx\n", (long long unsigned int)dev->dsr_info.dma); 158 + dev->dsr_info.dsr = rdma_pci_dma_map(pci_dev, dev->dsr_info.dma, 159 + sizeof(struct pvrdma_device_shared_region)); 160 + if (!dev->dsr_info.dsr) { 161 + pr_err("Failed to map to DSR\n"); 162 + rc = -ENOMEM; 163 + goto out; 164 + } 165 + 166 + /* Shortcuts */ 167 + dsr_info = &dev->dsr_info; 168 + dsr = dsr_info->dsr; 169 + 170 + /* Map to command slot */ 171 + pr_dbg("cmd_dma=0x%llx\n", (long long unsigned int)dsr->cmd_slot_dma); 172 + dsr_info->req = rdma_pci_dma_map(pci_dev, dsr->cmd_slot_dma, 173 + sizeof(union pvrdma_cmd_req)); 174 + if (!dsr_info->req) { 175 + pr_err("Failed to map to command slot address\n"); 176 + rc = -ENOMEM; 177 + goto out_free_dsr; 178 + } 179 + 180 + /* Map to response slot */ 181 + pr_dbg("rsp_dma=0x%llx\n", (long long unsigned int)dsr->resp_slot_dma); 182 + dsr_info->rsp = rdma_pci_dma_map(pci_dev, dsr->resp_slot_dma, 183 + sizeof(union pvrdma_cmd_resp)); 184 + if (!dsr_info->rsp) { 185 + pr_err("Failed to map to response slot address\n"); 186 + rc = -ENOMEM; 187 + goto out_free_req; 188 + } 189 + 190 + /* Map to CQ notification ring */ 191 + rc = init_dev_ring(&dsr_info->cq, &dsr_info->cq_ring_state, "dev_cq", 192 + pci_dev, dsr->cq_ring_pages.pdir_dma, 193 + dsr->cq_ring_pages.num_pages); 194 + if (rc) { 195 + pr_err("Failed to map to initialize CQ ring\n"); 196 + rc = -ENOMEM; 197 + goto out_free_rsp; 198 + } 199 + 200 + /* Map to event notification ring */ 201 + rc = init_dev_ring(&dsr_info->async, &dsr_info->async_ring_state, 202 + "dev_async", pci_dev, dsr->async_ring_pages.pdir_dma, 203 + dsr->async_ring_pages.num_pages); 204 + if (rc) { 205 + pr_err("Failed to map to initialize event ring\n"); 206 + rc = -ENOMEM; 207 + goto out_free_rsp; 208 + } 209 + 210 + goto out; 211 + 212 + out_free_rsp: 213 + rdma_pci_dma_unmap(pci_dev, dsr_info->rsp, sizeof(union pvrdma_cmd_resp)); 214 + 215 + out_free_req: 216 + rdma_pci_dma_unmap(pci_dev, dsr_info->req, sizeof(union pvrdma_cmd_req)); 217 + 218 + out_free_dsr: 219 + 
rdma_pci_dma_unmap(pci_dev, dsr_info->dsr, 220 + sizeof(struct pvrdma_device_shared_region)); 221 + dsr_info->dsr = NULL; 222 + 223 + out: 224 + return rc; 225 + } 226 + 227 + static void init_dsr_dev_caps(PVRDMADev *dev) 228 + { 229 + struct pvrdma_device_shared_region *dsr; 230 + 231 + if (dev->dsr_info.dsr == NULL) { 232 + pr_err("Can't initialized DSR\n"); 233 + return; 234 + } 235 + 236 + dsr = dev->dsr_info.dsr; 237 + 238 + dsr->caps.fw_ver = PVRDMA_FW_VERSION; 239 + pr_dbg("fw_ver=0x%lx\n", dsr->caps.fw_ver); 240 + 241 + dsr->caps.mode = PVRDMA_DEVICE_MODE_ROCE; 242 + pr_dbg("mode=%d\n", dsr->caps.mode); 243 + 244 + dsr->caps.gid_types |= PVRDMA_GID_TYPE_FLAG_ROCE_V1; 245 + pr_dbg("gid_types=0x%x\n", dsr->caps.gid_types); 246 + 247 + dsr->caps.max_uar = RDMA_BAR2_UAR_SIZE; 248 + pr_dbg("max_uar=%d\n", dsr->caps.max_uar); 249 + 250 + dsr->caps.max_mr_size = dev->dev_attr.max_mr_size; 251 + dsr->caps.max_qp = dev->dev_attr.max_qp; 252 + dsr->caps.max_qp_wr = dev->dev_attr.max_qp_wr; 253 + dsr->caps.max_sge = dev->dev_attr.max_sge; 254 + dsr->caps.max_cq = dev->dev_attr.max_cq; 255 + dsr->caps.max_cqe = dev->dev_attr.max_cqe; 256 + dsr->caps.max_mr = dev->dev_attr.max_mr; 257 + dsr->caps.max_pd = dev->dev_attr.max_pd; 258 + dsr->caps.max_ah = dev->dev_attr.max_ah; 259 + 260 + dsr->caps.gid_tbl_len = MAX_GIDS; 261 + pr_dbg("gid_tbl_len=%d\n", dsr->caps.gid_tbl_len); 262 + 263 + dsr->caps.sys_image_guid = 0; 264 + pr_dbg("sys_image_guid=%lx\n", dsr->caps.sys_image_guid); 265 + 266 + dsr->caps.node_guid = cpu_to_be64(dev->node_guid); 267 + pr_dbg("node_guid=%llx\n", 268 + (long long unsigned int)be64_to_cpu(dsr->caps.node_guid)); 269 + 270 + dsr->caps.phys_port_cnt = MAX_PORTS; 271 + pr_dbg("phys_port_cnt=%d\n", dsr->caps.phys_port_cnt); 272 + 273 + dsr->caps.max_pkeys = MAX_PKEYS; 274 + pr_dbg("max_pkeys=%d\n", dsr->caps.max_pkeys); 275 + 276 + pr_dbg("Initialized\n"); 277 + } 278 + 279 + static void free_ports(PVRDMADev *dev) 280 + { 281 + int i; 282 + 283 + for (i = 0; i < MAX_PORTS; i++) { 284 + g_free(dev->rdma_dev_res.ports[i].gid_tbl); 285 + } 286 + } 287 + 288 + static void init_ports(PVRDMADev *dev, Error **errp) 289 + { 290 + int i; 291 + 292 + memset(dev->rdma_dev_res.ports, 0, sizeof(dev->rdma_dev_res.ports)); 293 + 294 + for (i = 0; i < MAX_PORTS; i++) { 295 + dev->rdma_dev_res.ports[i].state = PVRDMA_PORT_DOWN; 296 + 297 + dev->rdma_dev_res.ports[i].pkey_tbl = 298 + g_malloc0(sizeof(*dev->rdma_dev_res.ports[i].pkey_tbl) * 299 + MAX_PORT_PKEYS); 300 + } 301 + } 302 + 303 + static void activate_device(PVRDMADev *dev) 304 + { 305 + set_reg_val(dev, PVRDMA_REG_ERR, 0); 306 + pr_dbg("Device activated\n"); 307 + } 308 + 309 + static int unquiesce_device(PVRDMADev *dev) 310 + { 311 + pr_dbg("Device unquiesced\n"); 312 + return 0; 313 + } 314 + 315 + static int reset_device(PVRDMADev *dev) 316 + { 317 + pr_dbg("Device reset complete\n"); 318 + return 0; 319 + } 320 + 321 + static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size) 322 + { 323 + PVRDMADev *dev = opaque; 324 + uint32_t val; 325 + 326 + /* pr_dbg("addr=0x%lx, size=%d\n", addr, size); */ 327 + 328 + if (get_reg_val(dev, addr, &val)) { 329 + pr_dbg("Error trying to read REG value from address 0x%x\n", 330 + (uint32_t)addr); 331 + return -EINVAL; 332 + } 333 + 334 + trace_pvrdma_regs_read(addr, val); 335 + 336 + return val; 337 + } 338 + 339 + static void regs_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) 340 + { 341 + PVRDMADev *dev = opaque; 342 + 343 + /* pr_dbg("addr=0x%lx, val=0x%x, 
size=%d\n", addr, (uint32_t)val, size); */ 344 + 345 + if (set_reg_val(dev, addr, val)) { 346 + pr_err("Error trying to set REG value, addr=0x%lx, val=0x%lx\n", 347 + (uint64_t)addr, val); 348 + return; 349 + } 350 + 351 + trace_pvrdma_regs_write(addr, val); 352 + 353 + switch (addr) { 354 + case PVRDMA_REG_DSRLOW: 355 + dev->dsr_info.dma = val; 356 + break; 357 + case PVRDMA_REG_DSRHIGH: 358 + dev->dsr_info.dma |= val << 32; 359 + load_dsr(dev); 360 + init_dsr_dev_caps(dev); 361 + break; 362 + case PVRDMA_REG_CTL: 363 + switch (val) { 364 + case PVRDMA_DEVICE_CTL_ACTIVATE: 365 + activate_device(dev); 366 + break; 367 + case PVRDMA_DEVICE_CTL_UNQUIESCE: 368 + unquiesce_device(dev); 369 + break; 370 + case PVRDMA_DEVICE_CTL_RESET: 371 + reset_device(dev); 372 + break; 373 + } 374 + break; 375 + case PVRDMA_REG_IMR: 376 + pr_dbg("Interrupt mask=0x%lx\n", val); 377 + dev->interrupt_mask = val; 378 + break; 379 + case PVRDMA_REG_REQUEST: 380 + if (val == 0) { 381 + execute_command(dev); 382 + } 383 + break; 384 + default: 385 + break; 386 + } 387 + } 388 + 389 + static const MemoryRegionOps regs_ops = { 390 + .read = regs_read, 391 + .write = regs_write, 392 + .endianness = DEVICE_LITTLE_ENDIAN, 393 + .impl = { 394 + .min_access_size = sizeof(uint32_t), 395 + .max_access_size = sizeof(uint32_t), 396 + }, 397 + }; 398 + 399 + static void uar_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) 400 + { 401 + PVRDMADev *dev = opaque; 402 + 403 + /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */ 404 + 405 + switch (addr & 0xFFF) { /* Mask with 0xFFF as each UC gets page */ 406 + case PVRDMA_UAR_QP_OFFSET: 407 + pr_dbg("UAR QP command, addr=0x%x, val=0x%lx\n", (uint32_t)addr, val); 408 + if (val & PVRDMA_UAR_QP_SEND) { 409 + pvrdma_qp_send(dev, val & PVRDMA_UAR_HANDLE_MASK); 410 + } 411 + if (val & PVRDMA_UAR_QP_RECV) { 412 + pvrdma_qp_recv(dev, val & PVRDMA_UAR_HANDLE_MASK); 413 + } 414 + break; 415 + case PVRDMA_UAR_CQ_OFFSET: 416 + /* pr_dbg("UAR CQ cmd, addr=0x%x, val=0x%lx\n", (uint32_t)addr, val); */ 417 + if (val & PVRDMA_UAR_CQ_ARM) { 418 + rdma_rm_req_notify_cq(&dev->rdma_dev_res, 419 + val & PVRDMA_UAR_HANDLE_MASK, 420 + !!(val & PVRDMA_UAR_CQ_ARM_SOL)); 421 + } 422 + if (val & PVRDMA_UAR_CQ_ARM_SOL) { 423 + pr_dbg("UAR_CQ_ARM_SOL (%ld)\n", val & PVRDMA_UAR_HANDLE_MASK); 424 + } 425 + if (val & PVRDMA_UAR_CQ_POLL) { 426 + pr_dbg("UAR_CQ_POLL (%ld)\n", val & PVRDMA_UAR_HANDLE_MASK); 427 + pvrdma_cq_poll(&dev->rdma_dev_res, val & PVRDMA_UAR_HANDLE_MASK); 428 + } 429 + break; 430 + default: 431 + pr_err("Unsupported command, addr=0x%lx, val=0x%lx\n", 432 + (uint64_t)addr, val); 433 + break; 434 + } 435 + } 436 + 437 + static const MemoryRegionOps uar_ops = { 438 + .write = uar_write, 439 + .endianness = DEVICE_LITTLE_ENDIAN, 440 + .impl = { 441 + .min_access_size = sizeof(uint32_t), 442 + .max_access_size = sizeof(uint32_t), 443 + }, 444 + }; 445 + 446 + static void init_pci_config(PCIDevice *pdev) 447 + { 448 + pdev->config[PCI_INTERRUPT_PIN] = 1; 449 + } 450 + 451 + static void init_bars(PCIDevice *pdev) 452 + { 453 + PVRDMADev *dev = PVRDMA_DEV(pdev); 454 + 455 + /* BAR 0 - MSI-X */ 456 + memory_region_init(&dev->msix, OBJECT(dev), "pvrdma-msix", 457 + RDMA_BAR0_MSIX_SIZE); 458 + pci_register_bar(pdev, RDMA_MSIX_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY, 459 + &dev->msix); 460 + 461 + /* BAR 1 - Registers */ 462 + memset(&dev->regs_data, 0, sizeof(dev->regs_data)); 463 + memory_region_init_io(&dev->regs, OBJECT(dev), &regs_ops, dev, 464 + "pvrdma-regs", 
RDMA_BAR1_REGS_SIZE); 465 + pci_register_bar(pdev, RDMA_REG_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY, 466 + &dev->regs); 467 + 468 + /* BAR 2 - UAR */ 469 + memset(&dev->uar_data, 0, sizeof(dev->uar_data)); 470 + memory_region_init_io(&dev->uar, OBJECT(dev), &uar_ops, dev, "rdma-uar", 471 + RDMA_BAR2_UAR_SIZE); 472 + pci_register_bar(pdev, RDMA_UAR_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY, 473 + &dev->uar); 474 + } 475 + 476 + static void init_regs(PCIDevice *pdev) 477 + { 478 + PVRDMADev *dev = PVRDMA_DEV(pdev); 479 + 480 + set_reg_val(dev, PVRDMA_REG_VERSION, PVRDMA_HW_VERSION); 481 + set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF); 482 + } 483 + 484 + static void uninit_msix(PCIDevice *pdev, int used_vectors) 485 + { 486 + PVRDMADev *dev = PVRDMA_DEV(pdev); 487 + int i; 488 + 489 + for (i = 0; i < used_vectors; i++) { 490 + msix_vector_unuse(pdev, i); 491 + } 492 + 493 + msix_uninit(pdev, &dev->msix, &dev->msix); 494 + } 495 + 496 + static int init_msix(PCIDevice *pdev, Error **errp) 497 + { 498 + PVRDMADev *dev = PVRDMA_DEV(pdev); 499 + int i; 500 + int rc; 501 + 502 + rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX, 503 + RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX, 504 + RDMA_MSIX_PBA, 0, NULL); 505 + 506 + if (rc < 0) { 507 + error_setg(errp, "Failed to initialize MSI-X"); 508 + return rc; 509 + } 510 + 511 + for (i = 0; i < RDMA_MAX_INTRS; i++) { 512 + rc = msix_vector_use(PCI_DEVICE(dev), i); 513 + if (rc < 0) { 514 + error_setg(errp, "Fail mark MSI-X vercor %d", i); 515 + uninit_msix(pdev, i); 516 + return rc; 517 + } 518 + } 519 + 520 + return 0; 521 + } 522 + 523 + static void init_dev_caps(PVRDMADev *dev) 524 + { 525 + size_t pg_tbl_bytes = TARGET_PAGE_SIZE * 526 + (TARGET_PAGE_SIZE / sizeof(uint64_t)); 527 + size_t wr_sz = MAX(sizeof(struct pvrdma_sq_wqe_hdr), 528 + sizeof(struct pvrdma_rq_wqe_hdr)); 529 + 530 + dev->dev_attr.max_qp_wr = pg_tbl_bytes / 531 + (wr_sz + sizeof(struct pvrdma_sge) * MAX_SGE) - 532 + TARGET_PAGE_SIZE; /* First page is ring state */ 533 + pr_dbg("max_qp_wr=%d\n", dev->dev_attr.max_qp_wr); 534 + 535 + dev->dev_attr.max_cqe = pg_tbl_bytes / sizeof(struct pvrdma_cqe) - 536 + TARGET_PAGE_SIZE; /* First page is ring state */ 537 + pr_dbg("max_cqe=%d\n", dev->dev_attr.max_cqe); 538 + } 539 + 540 + static int pvrdma_check_ram_shared(Object *obj, void *opaque) 541 + { 542 + bool *shared = opaque; 543 + 544 + if (object_dynamic_cast(obj, "memory-backend-ram")) { 545 + *shared = object_property_get_bool(obj, "share", NULL); 546 + } 547 + 548 + return 0; 549 + } 550 + 551 + static void pvrdma_realize(PCIDevice *pdev, Error **errp) 552 + { 553 + int rc; 554 + PVRDMADev *dev = PVRDMA_DEV(pdev); 555 + Object *memdev_root; 556 + bool ram_shared = false; 557 + 558 + pr_dbg("Initializing device %s %x.%x\n", pdev->name, 559 + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); 560 + 561 + if (TARGET_PAGE_SIZE != getpagesize()) { 562 + error_setg(errp, "Target page size must be the same as host page size"); 563 + return; 564 + } 565 + 566 + memdev_root = object_resolve_path("/objects", NULL); 567 + if (memdev_root) { 568 + object_child_foreach(memdev_root, pvrdma_check_ram_shared, &ram_shared); 569 + } 570 + if (!ram_shared) { 571 + error_setg(errp, "Only shared memory backed ram is supported"); 572 + return; 573 + } 574 + 575 + dev->dsr_info.dsr = NULL; 576 + 577 + init_pci_config(pdev); 578 + 579 + init_bars(pdev); 580 + 581 + init_regs(pdev); 582 + 583 + init_dev_caps(dev); 584 + 585 + rc = init_msix(pdev, errp); 586 + if (rc) { 587 + goto out; 588 + } 589 + 
590 + rc = rdma_backend_init(&dev->backend_dev, &dev->rdma_dev_res, 591 + dev->backend_device_name, dev->backend_port_num, 592 + dev->backend_gid_idx, &dev->dev_attr, errp); 593 + if (rc) { 594 + goto out; 595 + } 596 + 597 + rc = rdma_rm_init(&dev->rdma_dev_res, &dev->dev_attr, errp); 598 + if (rc) { 599 + goto out; 600 + } 601 + 602 + init_ports(dev, errp); 603 + 604 + rc = pvrdma_qp_ops_init(); 605 + if (rc) { 606 + goto out; 607 + } 608 + 609 + out: 610 + if (rc) { 611 + error_append_hint(errp, "Device fail to load\n"); 612 + } 613 + } 614 + 615 + static void pvrdma_exit(PCIDevice *pdev) 616 + { 617 + PVRDMADev *dev = PVRDMA_DEV(pdev); 618 + 619 + pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn), 620 + PCI_FUNC(pdev->devfn)); 621 + 622 + pvrdma_qp_ops_fini(); 623 + 624 + free_ports(dev); 625 + 626 + rdma_rm_fini(&dev->rdma_dev_res); 627 + 628 + rdma_backend_fini(&dev->backend_dev); 629 + 630 + free_dsr(dev); 631 + 632 + if (msix_enabled(pdev)) { 633 + uninit_msix(pdev, RDMA_MAX_INTRS); 634 + } 635 + } 636 + 637 + static void pvrdma_class_init(ObjectClass *klass, void *data) 638 + { 639 + DeviceClass *dc = DEVICE_CLASS(klass); 640 + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); 641 + 642 + k->realize = pvrdma_realize; 643 + k->exit = pvrdma_exit; 644 + k->vendor_id = PCI_VENDOR_ID_VMWARE; 645 + k->device_id = PCI_DEVICE_ID_VMWARE_PVRDMA; 646 + k->revision = 0x00; 647 + k->class_id = PCI_CLASS_NETWORK_OTHER; 648 + 649 + dc->desc = "RDMA Device"; 650 + dc->props = pvrdma_dev_properties; 651 + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); 652 + } 653 + 654 + static const TypeInfo pvrdma_info = { 655 + .name = PVRDMA_HW_NAME, 656 + .parent = TYPE_PCI_DEVICE, 657 + .instance_size = sizeof(PVRDMADev), 658 + .class_init = pvrdma_class_init, 659 + .interfaces = (InterfaceInfo[]) { 660 + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, 661 + { } 662 + } 663 + }; 664 + 665 + static void register_types(void) 666 + { 667 + type_register_static(&pvrdma_info); 668 + } 669 + 670 + type_init(register_types)
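For a sense of the capability sizing done in init_dev_caps() above: with 4 KiB target pages a single page-table page holds 4096 / 8 = 512 entries, so pg_tbl_bytes = 4096 * 512 = 2 MiB of ring space per table, and each send work request occupies one header plus MAX_SGE scatter/gather entries. A rough sketch of the same arithmetic, with the struct sizes and MAX_SGE assumed rather than taken from this series (illustrative only):

    /* Illustrative only: mirrors the init_dev_caps() sizing with 4 KiB pages.
     * wr_sz, sge_sz and max_sge are assumed values, not taken from the tree. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const size_t page = 4096;
        const size_t pg_tbl_bytes = page * (page / sizeof(uint64_t)); /* 2 MiB */
        const size_t wr_sz = 80;      /* assumed sizeof(struct pvrdma_sq_wqe_hdr) */
        const size_t sge_sz = 16;     /* assumed sizeof(struct pvrdma_sge) */
        const size_t max_sge = 4;     /* assumed MAX_SGE */

        printf("pg_tbl_bytes = %zu\n", pg_tbl_bytes);
        printf("max_qp_wr   ~= %zu\n",
               pg_tbl_bytes / (wr_sz + sge_sz * max_sge) - page);
        return 0;
    }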
+222
hw/rdma/vmw/pvrdma_qp_ops.c
··· 1 + /* 2 + * QEMU paravirtual RDMA - QP implementation 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #include <qemu/osdep.h> 17 + 18 + #include "../rdma_utils.h" 19 + #include "../rdma_rm.h" 20 + #include "../rdma_backend.h" 21 + 22 + #include "pvrdma.h" 23 + #include <standard-headers/rdma/vmw_pvrdma-abi.h> 24 + #include "pvrdma_qp_ops.h" 25 + 26 + typedef struct CompHandlerCtx { 27 + PVRDMADev *dev; 28 + uint32_t cq_handle; 29 + struct pvrdma_cqe cqe; 30 + } CompHandlerCtx; 31 + 32 + /* Send Queue WQE */ 33 + typedef struct PvrdmaSqWqe { 34 + struct pvrdma_sq_wqe_hdr hdr; 35 + struct pvrdma_sge sge[0]; 36 + } PvrdmaSqWqe; 37 + 38 + /* Recv Queue WQE */ 39 + typedef struct PvrdmaRqWqe { 40 + struct pvrdma_rq_wqe_hdr hdr; 41 + struct pvrdma_sge sge[0]; 42 + } PvrdmaRqWqe; 43 + 44 + /* 45 + * 1. Put CQE on send CQ ring 46 + * 2. Put CQ number on dsr completion ring 47 + * 3. Interrupt host 48 + */ 49 + static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle, 50 + struct pvrdma_cqe *cqe) 51 + { 52 + struct pvrdma_cqe *cqe1; 53 + struct pvrdma_cqne *cqne; 54 + PvrdmaRing *ring; 55 + RdmaRmCQ *cq = rdma_rm_get_cq(&dev->rdma_dev_res, cq_handle); 56 + 57 + if (unlikely(!cq)) { 58 + pr_dbg("Invalid cqn %d\n", cq_handle); 59 + return -EINVAL; 60 + } 61 + 62 + ring = (PvrdmaRing *)cq->opaque; 63 + pr_dbg("ring=%p\n", ring); 64 + 65 + /* Step #1: Put CQE on CQ ring */ 66 + pr_dbg("Writing CQE\n"); 67 + cqe1 = pvrdma_ring_next_elem_write(ring); 68 + if (unlikely(!cqe1)) { 69 + return -EINVAL; 70 + } 71 + 72 + cqe1->wr_id = cqe->wr_id; 73 + cqe1->qp = cqe->qp; 74 + cqe1->opcode = cqe->opcode; 75 + cqe1->status = cqe->status; 76 + cqe1->vendor_err = cqe->vendor_err; 77 + 78 + pvrdma_ring_write_inc(ring); 79 + 80 + /* Step #2: Put CQ number on dsr completion ring */ 81 + pr_dbg("Writing CQNE\n"); 82 + cqne = pvrdma_ring_next_elem_write(&dev->dsr_info.cq); 83 + if (unlikely(!cqne)) { 84 + return -EINVAL; 85 + } 86 + 87 + cqne->info = cq_handle; 88 + pvrdma_ring_write_inc(&dev->dsr_info.cq); 89 + 90 + pr_dbg("cq->notify=%d\n", cq->notify); 91 + if (cq->notify) { 92 + cq->notify = false; 93 + post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q); 94 + } 95 + 96 + return 0; 97 + } 98 + 99 + static void pvrdma_qp_ops_comp_handler(int status, unsigned int vendor_err, 100 + void *ctx) 101 + { 102 + CompHandlerCtx *comp_ctx = (CompHandlerCtx *)ctx; 103 + 104 + pr_dbg("cq_handle=%d\n", comp_ctx->cq_handle); 105 + pr_dbg("wr_id=%ld\n", comp_ctx->cqe.wr_id); 106 + pr_dbg("status=%d\n", status); 107 + pr_dbg("vendor_err=0x%x\n", vendor_err); 108 + comp_ctx->cqe.status = status; 109 + comp_ctx->cqe.vendor_err = vendor_err; 110 + pvrdma_post_cqe(comp_ctx->dev, comp_ctx->cq_handle, &comp_ctx->cqe); 111 + g_free(ctx); 112 + } 113 + 114 + void pvrdma_qp_ops_fini(void) 115 + { 116 + rdma_backend_unregister_comp_handler(); 117 + } 118 + 119 + int pvrdma_qp_ops_init(void) 120 + { 121 + rdma_backend_register_comp_handler(pvrdma_qp_ops_comp_handler); 122 + 123 + return 0; 124 + } 125 + 126 + int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) 127 + { 128 + RdmaRmQP *qp; 129 + PvrdmaSqWqe *wqe; 130 + PvrdmaRing *ring; 131 + 132 + pr_dbg("qp_handle=%d\n", qp_handle); 133 + 134 + qp = rdma_rm_get_qp(&dev->rdma_dev_res, 
qp_handle); 135 + if (unlikely(!qp)) { 136 + return -EINVAL; 137 + } 138 + 139 + ring = (PvrdmaRing *)qp->opaque; 140 + pr_dbg("sring=%p\n", ring); 141 + 142 + wqe = (struct PvrdmaSqWqe *)pvrdma_ring_next_elem_read(ring); 143 + while (wqe) { 144 + CompHandlerCtx *comp_ctx; 145 + 146 + pr_dbg("wr_id=%ld\n", wqe->hdr.wr_id); 147 + 148 + /* Prepare CQE */ 149 + comp_ctx = g_malloc(sizeof(CompHandlerCtx)); 150 + comp_ctx->dev = dev; 151 + comp_ctx->cq_handle = qp->send_cq_handle; 152 + comp_ctx->cqe.wr_id = wqe->hdr.wr_id; 153 + comp_ctx->cqe.qp = qp_handle; 154 + comp_ctx->cqe.opcode = wqe->hdr.opcode; 155 + 156 + rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, qp->qp_type, 157 + (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge, 158 + (union ibv_gid *)wqe->hdr.wr.ud.av.dgid, 159 + wqe->hdr.wr.ud.remote_qpn, 160 + wqe->hdr.wr.ud.remote_qkey, comp_ctx); 161 + 162 + pvrdma_ring_read_inc(ring); 163 + 164 + wqe = pvrdma_ring_next_elem_read(ring); 165 + } 166 + 167 + return 0; 168 + } 169 + 170 + int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle) 171 + { 172 + RdmaRmQP *qp; 173 + PvrdmaRqWqe *wqe; 174 + PvrdmaRing *ring; 175 + 176 + pr_dbg("qp_handle=%d\n", qp_handle); 177 + 178 + qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle); 179 + if (unlikely(!qp)) { 180 + return -EINVAL; 181 + } 182 + 183 + ring = &((PvrdmaRing *)qp->opaque)[1]; 184 + pr_dbg("rring=%p\n", ring); 185 + 186 + wqe = (struct PvrdmaRqWqe *)pvrdma_ring_next_elem_read(ring); 187 + while (wqe) { 188 + CompHandlerCtx *comp_ctx; 189 + 190 + pr_dbg("wr_id=%ld\n", wqe->hdr.wr_id); 191 + 192 + /* Prepare CQE */ 193 + comp_ctx = g_malloc(sizeof(CompHandlerCtx)); 194 + comp_ctx->dev = dev; 195 + comp_ctx->cq_handle = qp->recv_cq_handle; 196 + comp_ctx->cqe.qp = qp_handle; 197 + comp_ctx->cqe.wr_id = wqe->hdr.wr_id; 198 + 199 + rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res, 200 + &qp->backend_qp, qp->qp_type, 201 + (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge, 202 + comp_ctx); 203 + 204 + pvrdma_ring_read_inc(ring); 205 + 206 + wqe = pvrdma_ring_next_elem_read(ring); 207 + } 208 + 209 + return 0; 210 + } 211 + 212 + void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle) 213 + { 214 + RdmaRmCQ *cq; 215 + 216 + cq = rdma_rm_get_cq(dev_res, cq_handle); 217 + if (!cq) { 218 + pr_dbg("Invalid CQ# %d\n", cq_handle); 219 + } 220 + 221 + rdma_backend_poll_cq(dev_res, &cq->backend_cq); 222 + }
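The PvrdmaSqWqe/PvrdmaRqWqe wrappers above assume that the guest lays every work request out as a fixed header immediately followed by num_sge scatter/gather entries in the same ring slot. A minimal sketch of how a guest-side UD send WQE could be filled in, using the structures from include/standard-headers/rdma/vmw_pvrdma-abi.h shown later in this diff (all values are placeholders, not taken from a real driver):

    /* Illustrative only: filling one UD send WQE slot, guest-side view. */
    static void fill_ud_send_wqe(void *wqe_slot, uint64_t buf_gpa, uint32_t len,
                                 uint32_t lkey, uint32_t remote_qpn,
                                 uint32_t remote_qkey)
    {
        struct pvrdma_sq_wqe_hdr *hdr = wqe_slot;
        struct pvrdma_sge *sge = (struct pvrdma_sge *)(hdr + 1);

        hdr->wr_id      = 0x1234;               /* opaque, echoed back in the CQE */
        hdr->opcode     = PVRDMA_WR_SEND;
        hdr->send_flags = PVRDMA_SEND_SIGNALED;
        hdr->num_sge    = 1;
        hdr->wr.ud.remote_qpn  = remote_qpn;
        hdr->wr.ud.remote_qkey = remote_qkey;

        sge[0].addr   = buf_gpa;                /* guest-physical data address */
        sge[0].length = len;
        sge[0].lkey   = lkey;                   /* from PVRDMA_CMD_CREATE_MR */
    }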
+27
hw/rdma/vmw/pvrdma_qp_ops.h
··· 1 + /* 2 + * QEMU VMWARE paravirtual RDMA QP Operations 3 + * 4 + * Copyright (C) 2018 Oracle 5 + * Copyright (C) 2018 Red Hat Inc 6 + * 7 + * Authors: 8 + * Yuval Shaia <yuval.shaia@oracle.com> 9 + * Marcel Apfelbaum <marcel@redhat.com> 10 + * 11 + * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 + * See the COPYING file in the top-level directory. 13 + * 14 + */ 15 + 16 + #ifndef PVRDMA_QP_H 17 + #define PVRDMA_QP_H 18 + 19 + #include "pvrdma.h" 20 + 21 + int pvrdma_qp_ops_init(void); 22 + void pvrdma_qp_ops_fini(void); 23 + int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle); 24 + int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle); 25 + void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle); 26 + 27 + #endif
+5
hw/rdma/vmw/trace-events
··· 1 + # See docs/tracing.txt for syntax documentation. 2 + 3 + # hw/rdma/vmw/pvrdma_main.c 4 + pvrdma_regs_read(uint64_t addr, uint64_t val) "regs[0x%"PRIx64"] = 0x%"PRIx64 5 + pvrdma_regs_write(uint64_t addr, uint64_t val) "regs[0x%"PRIx64"] = 0x%"PRIx64
+23
include/exec/memory.h
··· 436 436 Error **errp); 437 437 438 438 /** 439 + * memory_region_init_ram_shared_nomigrate: Initialize RAM memory region. 440 + * Accesses into the region will 441 + * modify memory directly. 442 + * 443 + * @mr: the #MemoryRegion to be initialized. 444 + * @owner: the object that tracks the region's reference count 445 + * @name: Region name, becomes part of RAMBlock name used in migration stream 446 + * must be unique within any device 447 + * @size: size of the region. 448 + * @share: allow remapping RAM to different addresses 449 + * @errp: pointer to Error*, to store an error if it happens. 450 + * 451 + * Note that this function is similar to memory_region_init_ram_nomigrate. 452 + * The only difference is part of the RAM region can be remapped. 453 + */ 454 + void memory_region_init_ram_shared_nomigrate(MemoryRegion *mr, 455 + struct Object *owner, 456 + const char *name, 457 + uint64_t size, 458 + bool share, 459 + Error **errp); 460 + 461 + /** 439 462 * memory_region_init_resizeable_ram: Initialize memory region with resizeable 440 463 * RAM. Accesses into the region will 441 464 * modify memory directly. Only an initial
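A minimal usage sketch for the new helper, based only on the prototype above (the device and region names are placeholders); existing callers of memory_region_init_ram_nomigrate() are unaffected, since that function now simply forwards with share=false (see the memory.c hunk further down):

    /* Illustrative only: allocating device RAM that may be mapped shared. */
    static void mydev_init_ram(DeviceState *dev, MemoryRegion *mr, Error **errp)
    {
        memory_region_init_ram_shared_nomigrate(mr, OBJECT(dev), "mydev.ram",
                                                0x10000, true /* share */, errp);
    }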
+2 -1
include/exec/ram_addr.h
··· 80 80 Error **errp); 81 81 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, 82 82 MemoryRegion *mr, Error **errp); 83 - RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp); 83 + RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share, MemoryRegion *mr, 84 + Error **errp); 84 85 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t max_size, 85 86 void (*resized)(const char*, 86 87 uint64_t length,
+3
include/hw/pci/pci_ids.h
··· 266 266 #define PCI_VENDOR_ID_TEWS 0x1498 267 267 #define PCI_DEVICE_ID_TEWS_TPCI200 0x30C8 268 268 269 + #define PCI_VENDOR_ID_VMWARE 0x15ad 270 + #define PCI_DEVICE_ID_VMWARE_PVRDMA 0x0820 271 + 269 272 #endif
+1 -1
include/qemu/osdep.h
··· 255 255 int qemu_daemon(int nochdir, int noclose); 256 256 void *qemu_try_memalign(size_t alignment, size_t size); 257 257 void *qemu_memalign(size_t alignment, size_t size); 258 - void *qemu_anon_ram_alloc(size_t size, uint64_t *align); 258 + void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared); 259 259 void qemu_vfree(void *ptr); 260 260 void qemu_anon_ram_free(void *ptr, size_t size); 261 261
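The extra bool argument lets callers ask for an anonymous RAM mapping that a co-operating process can attach to; a hedged sketch of passing it through (the actual MAP_SHARED vs MAP_PRIVATE choice happens in the POSIX implementation of this function, which is not part of this excerpt):

    /* Illustrative only: requesting anonymous guest RAM that can be shared. */
    size_t ram_size = 64 * 1024 * 1024;
    uint64_t align = 0;
    void *ram = qemu_anon_ram_alloc(ram_size, &align, true /* shared */);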
+667
include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
··· 1 + /* 2 + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of EITHER the GNU General Public License 6 + * version 2 as published by the Free Software Foundation or the BSD 7 + * 2-Clause License. This program is distributed in the hope that it 8 + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED 9 + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 10 + * See the GNU General Public License version 2 for more details at 11 + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program available in the file COPYING in the main 15 + * directory of this source tree. 16 + * 17 + * The BSD 2-Clause License 18 + * 19 + * Redistribution and use in source and binary forms, with or 20 + * without modification, are permitted provided that the following 21 + * conditions are met: 22 + * 23 + * - Redistributions of source code must retain the above 24 + * copyright notice, this list of conditions and the following 25 + * disclaimer. 26 + * 27 + * - Redistributions in binary form must reproduce the above 28 + * copyright notice, this list of conditions and the following 29 + * disclaimer in the documentation and/or other materials 30 + * provided with the distribution. 31 + * 32 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 33 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 34 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 35 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 36 + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 37 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 39 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 41 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 42 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 43 + * OF THE POSSIBILITY OF SUCH DAMAGE. 44 + */ 45 + 46 + #ifndef __PVRDMA_DEV_API_H__ 47 + #define __PVRDMA_DEV_API_H__ 48 + 49 + #include "standard-headers/linux/types.h" 50 + 51 + #include "pvrdma_verbs.h" 52 + 53 + /* 54 + * PVRDMA version macros. Some new features require updates to PVRDMA_VERSION. 55 + * These macros allow us to check for different features if necessary. 56 + */ 57 + 58 + #define PVRDMA_ROCEV1_VERSION 17 59 + #define PVRDMA_ROCEV2_VERSION 18 60 + #define PVRDMA_VERSION PVRDMA_ROCEV2_VERSION 61 + 62 + #define PVRDMA_BOARD_ID 1 63 + #define PVRDMA_REV_ID 1 64 + 65 + /* 66 + * Masks and accessors for page directory, which is a two-level lookup: 67 + * page directory -> page table -> page. Only one directory for now, but we 68 + * could expand that easily. 9 bits for tables, 9 bits for pages, gives one 69 + * gigabyte for memory regions and so forth. 70 + */ 71 + 72 + #define PVRDMA_PDIR_SHIFT 18 73 + #define PVRDMA_PTABLE_SHIFT 9 74 + #define PVRDMA_PAGE_DIR_DIR(x) (((x) >> PVRDMA_PDIR_SHIFT) & 0x1) 75 + #define PVRDMA_PAGE_DIR_TABLE(x) (((x) >> PVRDMA_PTABLE_SHIFT) & 0x1ff) 76 + #define PVRDMA_PAGE_DIR_PAGE(x) ((x) & 0x1ff) 77 + #define PVRDMA_PAGE_DIR_MAX_PAGES (1 * 512 * 512) 78 + #define PVRDMA_MAX_FAST_REG_PAGES 128 79 + 80 + /* 81 + * Max MSI-X vectors. 
82 + */ 83 + 84 + #define PVRDMA_MAX_INTERRUPTS 3 85 + 86 + /* Register offsets within PCI resource on BAR1. */ 87 + #define PVRDMA_REG_VERSION 0x00 /* R: Version of device. */ 88 + #define PVRDMA_REG_DSRLOW 0x04 /* W: Device shared region low PA. */ 89 + #define PVRDMA_REG_DSRHIGH 0x08 /* W: Device shared region high PA. */ 90 + #define PVRDMA_REG_CTL 0x0c /* W: PVRDMA_DEVICE_CTL */ 91 + #define PVRDMA_REG_REQUEST 0x10 /* W: Indicate device request. */ 92 + #define PVRDMA_REG_ERR 0x14 /* R: Device error. */ 93 + #define PVRDMA_REG_ICR 0x18 /* R: Interrupt cause. */ 94 + #define PVRDMA_REG_IMR 0x1c /* R/W: Interrupt mask. */ 95 + #define PVRDMA_REG_MACL 0x20 /* R/W: MAC address low. */ 96 + #define PVRDMA_REG_MACH 0x24 /* R/W: MAC address high. */ 97 + 98 + /* Object flags. */ 99 + #define PVRDMA_CQ_FLAG_ARMED_SOL BIT(0) /* Armed for solicited-only. */ 100 + #define PVRDMA_CQ_FLAG_ARMED BIT(1) /* Armed. */ 101 + #define PVRDMA_MR_FLAG_DMA BIT(0) /* DMA region. */ 102 + #define PVRDMA_MR_FLAG_FRMR BIT(1) /* Fast reg memory region. */ 103 + 104 + /* 105 + * Atomic operation capability (masked versions are extended atomic 106 + * operations. 107 + */ 108 + 109 + #define PVRDMA_ATOMIC_OP_COMP_SWAP BIT(0) /* Compare and swap. */ 110 + #define PVRDMA_ATOMIC_OP_FETCH_ADD BIT(1) /* Fetch and add. */ 111 + #define PVRDMA_ATOMIC_OP_MASK_COMP_SWAP BIT(2) /* Masked compare and swap. */ 112 + #define PVRDMA_ATOMIC_OP_MASK_FETCH_ADD BIT(3) /* Masked fetch and add. */ 113 + 114 + /* 115 + * Base Memory Management Extension flags to support Fast Reg Memory Regions 116 + * and Fast Reg Work Requests. Each flag represents a verb operation and we 117 + * must support all of them to qualify for the BMME device cap. 118 + */ 119 + 120 + #define PVRDMA_BMME_FLAG_LOCAL_INV BIT(0) /* Local Invalidate. */ 121 + #define PVRDMA_BMME_FLAG_REMOTE_INV BIT(1) /* Remote Invalidate. */ 122 + #define PVRDMA_BMME_FLAG_FAST_REG_WR BIT(2) /* Fast Reg Work Request. */ 123 + 124 + /* 125 + * GID types. The interpretation of the gid_types bit field in the device 126 + * capabilities will depend on the device mode. For now, the device only 127 + * supports RoCE as mode, so only the different GID types for RoCE are 128 + * defined. 129 + */ 130 + 131 + #define PVRDMA_GID_TYPE_FLAG_ROCE_V1 BIT(0) 132 + #define PVRDMA_GID_TYPE_FLAG_ROCE_V2 BIT(1) 133 + 134 + /* 135 + * Version checks. This checks whether each version supports specific 136 + * capabilities from the device. 137 + */ 138 + 139 + #define PVRDMA_IS_VERSION17(_dev) \ 140 + (_dev->dsr_version == PVRDMA_ROCEV1_VERSION && \ 141 + _dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1) 142 + 143 + #define PVRDMA_IS_VERSION18(_dev) \ 144 + (_dev->dsr_version >= PVRDMA_ROCEV2_VERSION && \ 145 + (_dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1 || \ 146 + _dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V2)) \ 147 + 148 + #define PVRDMA_SUPPORTED(_dev) \ 149 + ((_dev->dsr->caps.mode == PVRDMA_DEVICE_MODE_ROCE) && \ 150 + (PVRDMA_IS_VERSION17(_dev) || PVRDMA_IS_VERSION18(_dev))) 151 + 152 + /* 153 + * Get capability values based on device version. 154 + */ 155 + 156 + #define PVRDMA_GET_CAP(_dev, _old_val, _val) \ 157 + ((PVRDMA_IS_VERSION18(_dev)) ? _val : _old_val) 158 + 159 + enum pvrdma_pci_resource { 160 + PVRDMA_PCI_RESOURCE_MSIX, /* BAR0: MSI-X, MMIO. */ 161 + PVRDMA_PCI_RESOURCE_REG, /* BAR1: Registers, MMIO. */ 162 + PVRDMA_PCI_RESOURCE_UAR, /* BAR2: UAR pages, MMIO, 64-bit. */ 163 + PVRDMA_PCI_RESOURCE_LAST, /* Last. 
*/ 164 + }; 165 + 166 + enum pvrdma_device_ctl { 167 + PVRDMA_DEVICE_CTL_ACTIVATE, /* Activate device. */ 168 + PVRDMA_DEVICE_CTL_UNQUIESCE, /* Unquiesce device. */ 169 + PVRDMA_DEVICE_CTL_RESET, /* Reset device. */ 170 + }; 171 + 172 + enum pvrdma_intr_vector { 173 + PVRDMA_INTR_VECTOR_RESPONSE, /* Command response. */ 174 + PVRDMA_INTR_VECTOR_ASYNC, /* Async events. */ 175 + PVRDMA_INTR_VECTOR_CQ, /* CQ notification. */ 176 + /* Additional CQ notification vectors. */ 177 + }; 178 + 179 + enum pvrdma_intr_cause { 180 + PVRDMA_INTR_CAUSE_RESPONSE = (1 << PVRDMA_INTR_VECTOR_RESPONSE), 181 + PVRDMA_INTR_CAUSE_ASYNC = (1 << PVRDMA_INTR_VECTOR_ASYNC), 182 + PVRDMA_INTR_CAUSE_CQ = (1 << PVRDMA_INTR_VECTOR_CQ), 183 + }; 184 + 185 + enum pvrdma_gos_bits { 186 + PVRDMA_GOS_BITS_UNK, /* Unknown. */ 187 + PVRDMA_GOS_BITS_32, /* 32-bit. */ 188 + PVRDMA_GOS_BITS_64, /* 64-bit. */ 189 + }; 190 + 191 + enum pvrdma_gos_type { 192 + PVRDMA_GOS_TYPE_UNK, /* Unknown. */ 193 + PVRDMA_GOS_TYPE_LINUX, /* Linux. */ 194 + }; 195 + 196 + enum pvrdma_device_mode { 197 + PVRDMA_DEVICE_MODE_ROCE, /* RoCE. */ 198 + PVRDMA_DEVICE_MODE_IWARP, /* iWarp. */ 199 + PVRDMA_DEVICE_MODE_IB, /* InfiniBand. */ 200 + }; 201 + 202 + struct pvrdma_gos_info { 203 + uint32_t gos_bits:2; /* W: PVRDMA_GOS_BITS_ */ 204 + uint32_t gos_type:4; /* W: PVRDMA_GOS_TYPE_ */ 205 + uint32_t gos_ver:16; /* W: Guest OS version. */ 206 + uint32_t gos_misc:10; /* W: Other. */ 207 + uint32_t pad; /* Pad to 8-byte alignment. */ 208 + }; 209 + 210 + struct pvrdma_device_caps { 211 + uint64_t fw_ver; /* R: Query device. */ 212 + uint64_t node_guid; 213 + uint64_t sys_image_guid; 214 + uint64_t max_mr_size; 215 + uint64_t page_size_cap; 216 + uint64_t atomic_arg_sizes; /* EX verbs. */ 217 + uint32_t ex_comp_mask; /* EX verbs. */ 218 + uint32_t device_cap_flags2; /* EX verbs. */ 219 + uint32_t max_fa_bit_boundary; /* EX verbs. */ 220 + uint32_t log_max_atomic_inline_arg; /* EX verbs. */ 221 + uint32_t vendor_id; 222 + uint32_t vendor_part_id; 223 + uint32_t hw_ver; 224 + uint32_t max_qp; 225 + uint32_t max_qp_wr; 226 + uint32_t device_cap_flags; 227 + uint32_t max_sge; 228 + uint32_t max_sge_rd; 229 + uint32_t max_cq; 230 + uint32_t max_cqe; 231 + uint32_t max_mr; 232 + uint32_t max_pd; 233 + uint32_t max_qp_rd_atom; 234 + uint32_t max_ee_rd_atom; 235 + uint32_t max_res_rd_atom; 236 + uint32_t max_qp_init_rd_atom; 237 + uint32_t max_ee_init_rd_atom; 238 + uint32_t max_ee; 239 + uint32_t max_rdd; 240 + uint32_t max_mw; 241 + uint32_t max_raw_ipv6_qp; 242 + uint32_t max_raw_ethy_qp; 243 + uint32_t max_mcast_grp; 244 + uint32_t max_mcast_qp_attach; 245 + uint32_t max_total_mcast_qp_attach; 246 + uint32_t max_ah; 247 + uint32_t max_fmr; 248 + uint32_t max_map_per_fmr; 249 + uint32_t max_srq; 250 + uint32_t max_srq_wr; 251 + uint32_t max_srq_sge; 252 + uint32_t max_uar; 253 + uint32_t gid_tbl_len; 254 + uint16_t max_pkeys; 255 + uint8_t local_ca_ack_delay; 256 + uint8_t phys_port_cnt; 257 + uint8_t mode; /* PVRDMA_DEVICE_MODE_ */ 258 + uint8_t atomic_ops; /* PVRDMA_ATOMIC_OP_* bits */ 259 + uint8_t bmme_flags; /* FRWR Mem Mgmt Extensions */ 260 + uint8_t gid_types; /* PVRDMA_GID_TYPE_FLAG_ */ 261 + uint32_t max_fast_reg_page_list_len; 262 + }; 263 + 264 + struct pvrdma_ring_page_info { 265 + uint32_t num_pages; /* Num pages incl. header. */ 266 + uint32_t reserved; /* Reserved. */ 267 + uint64_t pdir_dma; /* Page directory PA. 
*/ 268 + }; 269 + 270 + #pragma pack(push, 1) 271 + 272 + struct pvrdma_device_shared_region { 273 + uint32_t driver_version; /* W: Driver version. */ 274 + uint32_t pad; /* Pad to 8-byte align. */ 275 + struct pvrdma_gos_info gos_info; /* W: Guest OS information. */ 276 + uint64_t cmd_slot_dma; /* W: Command slot address. */ 277 + uint64_t resp_slot_dma; /* W: Response slot address. */ 278 + struct pvrdma_ring_page_info async_ring_pages; 279 + /* W: Async ring page info. */ 280 + struct pvrdma_ring_page_info cq_ring_pages; 281 + /* W: CQ ring page info. */ 282 + uint32_t uar_pfn; /* W: UAR pageframe. */ 283 + uint32_t pad2; /* Pad to 8-byte align. */ 284 + struct pvrdma_device_caps caps; /* R: Device capabilities. */ 285 + }; 286 + 287 + #pragma pack(pop) 288 + 289 + /* Event types. Currently a 1:1 mapping with enum ib_event. */ 290 + enum pvrdma_eqe_type { 291 + PVRDMA_EVENT_CQ_ERR, 292 + PVRDMA_EVENT_QP_FATAL, 293 + PVRDMA_EVENT_QP_REQ_ERR, 294 + PVRDMA_EVENT_QP_ACCESS_ERR, 295 + PVRDMA_EVENT_COMM_EST, 296 + PVRDMA_EVENT_SQ_DRAINED, 297 + PVRDMA_EVENT_PATH_MIG, 298 + PVRDMA_EVENT_PATH_MIG_ERR, 299 + PVRDMA_EVENT_DEVICE_FATAL, 300 + PVRDMA_EVENT_PORT_ACTIVE, 301 + PVRDMA_EVENT_PORT_ERR, 302 + PVRDMA_EVENT_LID_CHANGE, 303 + PVRDMA_EVENT_PKEY_CHANGE, 304 + PVRDMA_EVENT_SM_CHANGE, 305 + PVRDMA_EVENT_SRQ_ERR, 306 + PVRDMA_EVENT_SRQ_LIMIT_REACHED, 307 + PVRDMA_EVENT_QP_LAST_WQE_REACHED, 308 + PVRDMA_EVENT_CLIENT_REREGISTER, 309 + PVRDMA_EVENT_GID_CHANGE, 310 + }; 311 + 312 + /* Event queue element. */ 313 + struct pvrdma_eqe { 314 + uint32_t type; /* Event type. */ 315 + uint32_t info; /* Handle, other. */ 316 + }; 317 + 318 + /* CQ notification queue element. */ 319 + struct pvrdma_cqne { 320 + uint32_t info; /* Handle */ 321 + }; 322 + 323 + enum { 324 + PVRDMA_CMD_FIRST, 325 + PVRDMA_CMD_QUERY_PORT = PVRDMA_CMD_FIRST, 326 + PVRDMA_CMD_QUERY_PKEY, 327 + PVRDMA_CMD_CREATE_PD, 328 + PVRDMA_CMD_DESTROY_PD, 329 + PVRDMA_CMD_CREATE_MR, 330 + PVRDMA_CMD_DESTROY_MR, 331 + PVRDMA_CMD_CREATE_CQ, 332 + PVRDMA_CMD_RESIZE_CQ, 333 + PVRDMA_CMD_DESTROY_CQ, 334 + PVRDMA_CMD_CREATE_QP, 335 + PVRDMA_CMD_MODIFY_QP, 336 + PVRDMA_CMD_QUERY_QP, 337 + PVRDMA_CMD_DESTROY_QP, 338 + PVRDMA_CMD_CREATE_UC, 339 + PVRDMA_CMD_DESTROY_UC, 340 + PVRDMA_CMD_CREATE_BIND, 341 + PVRDMA_CMD_DESTROY_BIND, 342 + PVRDMA_CMD_CREATE_SRQ, 343 + PVRDMA_CMD_MODIFY_SRQ, 344 + PVRDMA_CMD_QUERY_SRQ, 345 + PVRDMA_CMD_DESTROY_SRQ, 346 + PVRDMA_CMD_MAX, 347 + }; 348 + 349 + enum { 350 + PVRDMA_CMD_FIRST_RESP = (1 << 31), 351 + PVRDMA_CMD_QUERY_PORT_RESP = PVRDMA_CMD_FIRST_RESP, 352 + PVRDMA_CMD_QUERY_PKEY_RESP, 353 + PVRDMA_CMD_CREATE_PD_RESP, 354 + PVRDMA_CMD_DESTROY_PD_RESP_NOOP, 355 + PVRDMA_CMD_CREATE_MR_RESP, 356 + PVRDMA_CMD_DESTROY_MR_RESP_NOOP, 357 + PVRDMA_CMD_CREATE_CQ_RESP, 358 + PVRDMA_CMD_RESIZE_CQ_RESP, 359 + PVRDMA_CMD_DESTROY_CQ_RESP_NOOP, 360 + PVRDMA_CMD_CREATE_QP_RESP, 361 + PVRDMA_CMD_MODIFY_QP_RESP, 362 + PVRDMA_CMD_QUERY_QP_RESP, 363 + PVRDMA_CMD_DESTROY_QP_RESP, 364 + PVRDMA_CMD_CREATE_UC_RESP, 365 + PVRDMA_CMD_DESTROY_UC_RESP_NOOP, 366 + PVRDMA_CMD_CREATE_BIND_RESP_NOOP, 367 + PVRDMA_CMD_DESTROY_BIND_RESP_NOOP, 368 + PVRDMA_CMD_CREATE_SRQ_RESP, 369 + PVRDMA_CMD_MODIFY_SRQ_RESP, 370 + PVRDMA_CMD_QUERY_SRQ_RESP, 371 + PVRDMA_CMD_DESTROY_SRQ_RESP, 372 + PVRDMA_CMD_MAX_RESP, 373 + }; 374 + 375 + struct pvrdma_cmd_hdr { 376 + uint64_t response; /* Key for response lookup. */ 377 + uint32_t cmd; /* PVRDMA_CMD_ */ 378 + uint32_t reserved; /* Reserved. 
*/ 379 + }; 380 + 381 + struct pvrdma_cmd_resp_hdr { 382 + uint64_t response; /* From cmd hdr. */ 383 + uint32_t ack; /* PVRDMA_CMD_XXX_RESP */ 384 + uint8_t err; /* Error. */ 385 + uint8_t reserved[3]; /* Reserved. */ 386 + }; 387 + 388 + struct pvrdma_cmd_query_port { 389 + struct pvrdma_cmd_hdr hdr; 390 + uint8_t port_num; 391 + uint8_t reserved[7]; 392 + }; 393 + 394 + struct pvrdma_cmd_query_port_resp { 395 + struct pvrdma_cmd_resp_hdr hdr; 396 + struct pvrdma_port_attr attrs; 397 + }; 398 + 399 + struct pvrdma_cmd_query_pkey { 400 + struct pvrdma_cmd_hdr hdr; 401 + uint8_t port_num; 402 + uint8_t index; 403 + uint8_t reserved[6]; 404 + }; 405 + 406 + struct pvrdma_cmd_query_pkey_resp { 407 + struct pvrdma_cmd_resp_hdr hdr; 408 + uint16_t pkey; 409 + uint8_t reserved[6]; 410 + }; 411 + 412 + struct pvrdma_cmd_create_uc { 413 + struct pvrdma_cmd_hdr hdr; 414 + uint32_t pfn; /* UAR page frame number */ 415 + uint8_t reserved[4]; 416 + }; 417 + 418 + struct pvrdma_cmd_create_uc_resp { 419 + struct pvrdma_cmd_resp_hdr hdr; 420 + uint32_t ctx_handle; 421 + uint8_t reserved[4]; 422 + }; 423 + 424 + struct pvrdma_cmd_destroy_uc { 425 + struct pvrdma_cmd_hdr hdr; 426 + uint32_t ctx_handle; 427 + uint8_t reserved[4]; 428 + }; 429 + 430 + struct pvrdma_cmd_create_pd { 431 + struct pvrdma_cmd_hdr hdr; 432 + uint32_t ctx_handle; 433 + uint8_t reserved[4]; 434 + }; 435 + 436 + struct pvrdma_cmd_create_pd_resp { 437 + struct pvrdma_cmd_resp_hdr hdr; 438 + uint32_t pd_handle; 439 + uint8_t reserved[4]; 440 + }; 441 + 442 + struct pvrdma_cmd_destroy_pd { 443 + struct pvrdma_cmd_hdr hdr; 444 + uint32_t pd_handle; 445 + uint8_t reserved[4]; 446 + }; 447 + 448 + struct pvrdma_cmd_create_mr { 449 + struct pvrdma_cmd_hdr hdr; 450 + uint64_t start; 451 + uint64_t length; 452 + uint64_t pdir_dma; 453 + uint32_t pd_handle; 454 + uint32_t access_flags; 455 + uint32_t flags; 456 + uint32_t nchunks; 457 + }; 458 + 459 + struct pvrdma_cmd_create_mr_resp { 460 + struct pvrdma_cmd_resp_hdr hdr; 461 + uint32_t mr_handle; 462 + uint32_t lkey; 463 + uint32_t rkey; 464 + uint8_t reserved[4]; 465 + }; 466 + 467 + struct pvrdma_cmd_destroy_mr { 468 + struct pvrdma_cmd_hdr hdr; 469 + uint32_t mr_handle; 470 + uint8_t reserved[4]; 471 + }; 472 + 473 + struct pvrdma_cmd_create_cq { 474 + struct pvrdma_cmd_hdr hdr; 475 + uint64_t pdir_dma; 476 + uint32_t ctx_handle; 477 + uint32_t cqe; 478 + uint32_t nchunks; 479 + uint8_t reserved[4]; 480 + }; 481 + 482 + struct pvrdma_cmd_create_cq_resp { 483 + struct pvrdma_cmd_resp_hdr hdr; 484 + uint32_t cq_handle; 485 + uint32_t cqe; 486 + }; 487 + 488 + struct pvrdma_cmd_resize_cq { 489 + struct pvrdma_cmd_hdr hdr; 490 + uint32_t cq_handle; 491 + uint32_t cqe; 492 + }; 493 + 494 + struct pvrdma_cmd_resize_cq_resp { 495 + struct pvrdma_cmd_resp_hdr hdr; 496 + uint32_t cqe; 497 + uint8_t reserved[4]; 498 + }; 499 + 500 + struct pvrdma_cmd_destroy_cq { 501 + struct pvrdma_cmd_hdr hdr; 502 + uint32_t cq_handle; 503 + uint8_t reserved[4]; 504 + }; 505 + 506 + struct pvrdma_cmd_create_srq { 507 + struct pvrdma_cmd_hdr hdr; 508 + uint64_t pdir_dma; 509 + uint32_t pd_handle; 510 + uint32_t nchunks; 511 + struct pvrdma_srq_attr attrs; 512 + uint8_t srq_type; 513 + uint8_t reserved[7]; 514 + }; 515 + 516 + struct pvrdma_cmd_create_srq_resp { 517 + struct pvrdma_cmd_resp_hdr hdr; 518 + uint32_t srqn; 519 + uint8_t reserved[4]; 520 + }; 521 + 522 + struct pvrdma_cmd_modify_srq { 523 + struct pvrdma_cmd_hdr hdr; 524 + uint32_t srq_handle; 525 + uint32_t attr_mask; 526 + struct pvrdma_srq_attr 
attrs; 527 + }; 528 + 529 + struct pvrdma_cmd_query_srq { 530 + struct pvrdma_cmd_hdr hdr; 531 + uint32_t srq_handle; 532 + uint8_t reserved[4]; 533 + }; 534 + 535 + struct pvrdma_cmd_query_srq_resp { 536 + struct pvrdma_cmd_resp_hdr hdr; 537 + struct pvrdma_srq_attr attrs; 538 + }; 539 + 540 + struct pvrdma_cmd_destroy_srq { 541 + struct pvrdma_cmd_hdr hdr; 542 + uint32_t srq_handle; 543 + uint8_t reserved[4]; 544 + }; 545 + 546 + struct pvrdma_cmd_create_qp { 547 + struct pvrdma_cmd_hdr hdr; 548 + uint64_t pdir_dma; 549 + uint32_t pd_handle; 550 + uint32_t send_cq_handle; 551 + uint32_t recv_cq_handle; 552 + uint32_t srq_handle; 553 + uint32_t max_send_wr; 554 + uint32_t max_recv_wr; 555 + uint32_t max_send_sge; 556 + uint32_t max_recv_sge; 557 + uint32_t max_inline_data; 558 + uint32_t lkey; 559 + uint32_t access_flags; 560 + uint16_t total_chunks; 561 + uint16_t send_chunks; 562 + uint16_t max_atomic_arg; 563 + uint8_t sq_sig_all; 564 + uint8_t qp_type; 565 + uint8_t is_srq; 566 + uint8_t reserved[3]; 567 + }; 568 + 569 + struct pvrdma_cmd_create_qp_resp { 570 + struct pvrdma_cmd_resp_hdr hdr; 571 + uint32_t qpn; 572 + uint32_t max_send_wr; 573 + uint32_t max_recv_wr; 574 + uint32_t max_send_sge; 575 + uint32_t max_recv_sge; 576 + uint32_t max_inline_data; 577 + }; 578 + 579 + struct pvrdma_cmd_modify_qp { 580 + struct pvrdma_cmd_hdr hdr; 581 + uint32_t qp_handle; 582 + uint32_t attr_mask; 583 + struct pvrdma_qp_attr attrs; 584 + }; 585 + 586 + struct pvrdma_cmd_query_qp { 587 + struct pvrdma_cmd_hdr hdr; 588 + uint32_t qp_handle; 589 + uint32_t attr_mask; 590 + }; 591 + 592 + struct pvrdma_cmd_query_qp_resp { 593 + struct pvrdma_cmd_resp_hdr hdr; 594 + struct pvrdma_qp_attr attrs; 595 + }; 596 + 597 + struct pvrdma_cmd_destroy_qp { 598 + struct pvrdma_cmd_hdr hdr; 599 + uint32_t qp_handle; 600 + uint8_t reserved[4]; 601 + }; 602 + 603 + struct pvrdma_cmd_destroy_qp_resp { 604 + struct pvrdma_cmd_resp_hdr hdr; 605 + uint32_t events_reported; 606 + uint8_t reserved[4]; 607 + }; 608 + 609 + struct pvrdma_cmd_create_bind { 610 + struct pvrdma_cmd_hdr hdr; 611 + uint32_t mtu; 612 + uint32_t vlan; 613 + uint32_t index; 614 + uint8_t new_gid[16]; 615 + uint8_t gid_type; 616 + uint8_t reserved[3]; 617 + }; 618 + 619 + struct pvrdma_cmd_destroy_bind { 620 + struct pvrdma_cmd_hdr hdr; 621 + uint32_t index; 622 + uint8_t dest_gid[16]; 623 + uint8_t reserved[4]; 624 + }; 625 + 626 + union pvrdma_cmd_req { 627 + struct pvrdma_cmd_hdr hdr; 628 + struct pvrdma_cmd_query_port query_port; 629 + struct pvrdma_cmd_query_pkey query_pkey; 630 + struct pvrdma_cmd_create_uc create_uc; 631 + struct pvrdma_cmd_destroy_uc destroy_uc; 632 + struct pvrdma_cmd_create_pd create_pd; 633 + struct pvrdma_cmd_destroy_pd destroy_pd; 634 + struct pvrdma_cmd_create_mr create_mr; 635 + struct pvrdma_cmd_destroy_mr destroy_mr; 636 + struct pvrdma_cmd_create_cq create_cq; 637 + struct pvrdma_cmd_resize_cq resize_cq; 638 + struct pvrdma_cmd_destroy_cq destroy_cq; 639 + struct pvrdma_cmd_create_qp create_qp; 640 + struct pvrdma_cmd_modify_qp modify_qp; 641 + struct pvrdma_cmd_query_qp query_qp; 642 + struct pvrdma_cmd_destroy_qp destroy_qp; 643 + struct pvrdma_cmd_create_bind create_bind; 644 + struct pvrdma_cmd_destroy_bind destroy_bind; 645 + struct pvrdma_cmd_create_srq create_srq; 646 + struct pvrdma_cmd_modify_srq modify_srq; 647 + struct pvrdma_cmd_query_srq query_srq; 648 + struct pvrdma_cmd_destroy_srq destroy_srq; 649 + }; 650 + 651 + union pvrdma_cmd_resp { 652 + struct pvrdma_cmd_resp_hdr hdr; 653 + struct 
pvrdma_cmd_query_port_resp query_port_resp; 654 + struct pvrdma_cmd_query_pkey_resp query_pkey_resp; 655 + struct pvrdma_cmd_create_uc_resp create_uc_resp; 656 + struct pvrdma_cmd_create_pd_resp create_pd_resp; 657 + struct pvrdma_cmd_create_mr_resp create_mr_resp; 658 + struct pvrdma_cmd_create_cq_resp create_cq_resp; 659 + struct pvrdma_cmd_resize_cq_resp resize_cq_resp; 660 + struct pvrdma_cmd_create_qp_resp create_qp_resp; 661 + struct pvrdma_cmd_query_qp_resp query_qp_resp; 662 + struct pvrdma_cmd_destroy_qp_resp destroy_qp_resp; 663 + struct pvrdma_cmd_create_srq_resp create_srq_resp; 664 + struct pvrdma_cmd_query_srq_resp query_srq_resp; 665 + }; 666 + 667 + #endif /* __PVRDMA_DEV_API_H__ */
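To make the two-level lookup concrete: with 4 KiB pages, 9 bits of page-table index and 9 bits of page index give 512 * 512 = 262144 pages, i.e. 1 GiB per memory region, which is exactly PVRDMA_PAGE_DIR_MAX_PAGES. A small worked example of the accessor macros (the index value is arbitrary):

    /* Splitting linear page index 135000 with the macros above:          */
    /* PVRDMA_PAGE_DIR_DIR(135000)   == (135000 >> 18) & 0x1   == 0       */
    /* PVRDMA_PAGE_DIR_TABLE(135000) == (135000 >> 9)  & 0x1ff == 263     */
    /* PVRDMA_PAGE_DIR_PAGE(135000)  ==  135000        & 0x1ff == 344     */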
+114
include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h
··· 1 + /* 2 + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of EITHER the GNU General Public License 6 + * version 2 as published by the Free Software Foundation or the BSD 7 + * 2-Clause License. This program is distributed in the hope that it 8 + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED 9 + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 10 + * See the GNU General Public License version 2 for more details at 11 + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program available in the file COPYING in the main 15 + * directory of this source tree. 16 + * 17 + * The BSD 2-Clause License 18 + * 19 + * Redistribution and use in source and binary forms, with or 20 + * without modification, are permitted provided that the following 21 + * conditions are met: 22 + * 23 + * - Redistributions of source code must retain the above 24 + * copyright notice, this list of conditions and the following 25 + * disclaimer. 26 + * 27 + * - Redistributions in binary form must reproduce the above 28 + * copyright notice, this list of conditions and the following 29 + * disclaimer in the documentation and/or other materials 30 + * provided with the distribution. 31 + * 32 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 33 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 34 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 35 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 36 + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 37 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 39 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 41 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 42 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 43 + * OF THE POSSIBILITY OF SUCH DAMAGE. 44 + */ 45 + 46 + #ifndef __PVRDMA_RING_H__ 47 + #define __PVRDMA_RING_H__ 48 + 49 + #include "standard-headers/linux/types.h" 50 + 51 + #define PVRDMA_INVALID_IDX -1 /* Invalid index. */ 52 + 53 + struct pvrdma_ring { 54 + int prod_tail; /* Producer tail. */ 55 + int cons_head; /* Consumer head. */ 56 + }; 57 + 58 + struct pvrdma_ring_state { 59 + struct pvrdma_ring tx; /* Tx ring. */ 60 + struct pvrdma_ring rx; /* Rx ring. */ 61 + }; 62 + 63 + static inline int pvrdma_idx_valid(uint32_t idx, uint32_t max_elems) 64 + { 65 + /* Generates fewer instructions than a less-than. */ 66 + return (idx & ~((max_elems << 1) - 1)) == 0; 67 + } 68 + 69 + static inline int32_t pvrdma_idx(int *var, uint32_t max_elems) 70 + { 71 + const unsigned int idx = atomic_read(var); 72 + 73 + if (pvrdma_idx_valid(idx, max_elems)) 74 + return idx & (max_elems - 1); 75 + return PVRDMA_INVALID_IDX; 76 + } 77 + 78 + static inline void pvrdma_idx_ring_inc(int *var, uint32_t max_elems) 79 + { 80 + uint32_t idx = atomic_read(var) + 1; /* Increment. */ 81 + 82 + idx &= (max_elems << 1) - 1; /* Modulo size, flip gen. 
*/ 83 + atomic_set(var, idx); 84 + } 85 + 86 + static inline int32_t pvrdma_idx_ring_has_space(const struct pvrdma_ring *r, 87 + uint32_t max_elems, uint32_t *out_tail) 88 + { 89 + const uint32_t tail = atomic_read(&r->prod_tail); 90 + const uint32_t head = atomic_read(&r->cons_head); 91 + 92 + if (pvrdma_idx_valid(tail, max_elems) && 93 + pvrdma_idx_valid(head, max_elems)) { 94 + *out_tail = tail & (max_elems - 1); 95 + return tail != (head ^ max_elems); 96 + } 97 + return PVRDMA_INVALID_IDX; 98 + } 99 + 100 + static inline int32_t pvrdma_idx_ring_has_data(const struct pvrdma_ring *r, 101 + uint32_t max_elems, uint32_t *out_head) 102 + { 103 + const uint32_t tail = atomic_read(&r->prod_tail); 104 + const uint32_t head = atomic_read(&r->cons_head); 105 + 106 + if (pvrdma_idx_valid(tail, max_elems) && 107 + pvrdma_idx_valid(head, max_elems)) { 108 + *out_head = head & (max_elems - 1); 109 + return tail != head; 110 + } 111 + return PVRDMA_INVALID_IDX; 112 + } 113 + 114 + #endif /* __PVRDMA_RING_H__ */
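The ring indices above deliberately run over twice the ring size: an index stays valid while it is below 2 * max_elems, the low bits select the slot, and the extra bit acts as a lap (generation) flag so that full and empty can be distinguished without a separate element count. A short worked example with max_elems = 4:

    /* max_elems = 4, so raw indices wrap at 8 and slot = idx & 3.            */
    /* empty:  tail = 5, head = 5: has_data()  sees tail == head       -> 0   */
    /* full:   tail = 6, head = 2: has_space() sees tail == (head ^ 4) -> 0   */
    /* 1 used: tail = 6, head = 5: has_data()  -> 1, *out_head = 5 & 3 = 1    */
    /*                             has_space() -> 1, *out_tail = 6 & 3 = 2    */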
+383
include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
··· 1 + /* 2 + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of EITHER the GNU General Public License 6 + * version 2 as published by the Free Software Foundation or the BSD 7 + * 2-Clause License. This program is distributed in the hope that it 8 + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED 9 + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 10 + * See the GNU General Public License version 2 for more details at 11 + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program available in the file COPYING in the main 15 + * directory of this source tree. 16 + * 17 + * The BSD 2-Clause License 18 + * 19 + * Redistribution and use in source and binary forms, with or 20 + * without modification, are permitted provided that the following 21 + * conditions are met: 22 + * 23 + * - Redistributions of source code must retain the above 24 + * copyright notice, this list of conditions and the following 25 + * disclaimer. 26 + * 27 + * - Redistributions in binary form must reproduce the above 28 + * copyright notice, this list of conditions and the following 29 + * disclaimer in the documentation and/or other materials 30 + * provided with the distribution. 31 + * 32 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 33 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 34 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 35 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 36 + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 37 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 39 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 41 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 42 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 43 + * OF THE POSSIBILITY OF SUCH DAMAGE. 
44 + */ 45 + 46 + #ifndef __PVRDMA_VERBS_H__ 47 + #define __PVRDMA_VERBS_H__ 48 + 49 + #include "standard-headers/linux/types.h" 50 + 51 + union pvrdma_gid { 52 + uint8_t raw[16]; 53 + struct { 54 + uint64_t subnet_prefix; 55 + uint64_t interface_id; 56 + } global; 57 + }; 58 + 59 + enum pvrdma_link_layer { 60 + PVRDMA_LINK_LAYER_UNSPECIFIED, 61 + PVRDMA_LINK_LAYER_INFINIBAND, 62 + PVRDMA_LINK_LAYER_ETHERNET, 63 + }; 64 + 65 + enum pvrdma_mtu { 66 + PVRDMA_MTU_256 = 1, 67 + PVRDMA_MTU_512 = 2, 68 + PVRDMA_MTU_1024 = 3, 69 + PVRDMA_MTU_2048 = 4, 70 + PVRDMA_MTU_4096 = 5, 71 + }; 72 + 73 + static inline int pvrdma_mtu_enum_to_int(enum pvrdma_mtu mtu) 74 + { 75 + switch (mtu) { 76 + case PVRDMA_MTU_256: return 256; 77 + case PVRDMA_MTU_512: return 512; 78 + case PVRDMA_MTU_1024: return 1024; 79 + case PVRDMA_MTU_2048: return 2048; 80 + case PVRDMA_MTU_4096: return 4096; 81 + default: return -1; 82 + } 83 + } 84 + 85 + static inline enum pvrdma_mtu pvrdma_mtu_int_to_enum(int mtu) 86 + { 87 + switch (mtu) { 88 + case 256: return PVRDMA_MTU_256; 89 + case 512: return PVRDMA_MTU_512; 90 + case 1024: return PVRDMA_MTU_1024; 91 + case 2048: return PVRDMA_MTU_2048; 92 + case 4096: 93 + default: return PVRDMA_MTU_4096; 94 + } 95 + } 96 + 97 + enum pvrdma_port_state { 98 + PVRDMA_PORT_NOP = 0, 99 + PVRDMA_PORT_DOWN = 1, 100 + PVRDMA_PORT_INIT = 2, 101 + PVRDMA_PORT_ARMED = 3, 102 + PVRDMA_PORT_ACTIVE = 4, 103 + PVRDMA_PORT_ACTIVE_DEFER = 5, 104 + }; 105 + 106 + enum pvrdma_port_cap_flags { 107 + PVRDMA_PORT_SM = 1 << 1, 108 + PVRDMA_PORT_NOTICE_SUP = 1 << 2, 109 + PVRDMA_PORT_TRAP_SUP = 1 << 3, 110 + PVRDMA_PORT_OPT_IPD_SUP = 1 << 4, 111 + PVRDMA_PORT_AUTO_MIGR_SUP = 1 << 5, 112 + PVRDMA_PORT_SL_MAP_SUP = 1 << 6, 113 + PVRDMA_PORT_MKEY_NVRAM = 1 << 7, 114 + PVRDMA_PORT_PKEY_NVRAM = 1 << 8, 115 + PVRDMA_PORT_LED_INFO_SUP = 1 << 9, 116 + PVRDMA_PORT_SM_DISABLED = 1 << 10, 117 + PVRDMA_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, 118 + PVRDMA_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, 119 + PVRDMA_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, 120 + PVRDMA_PORT_CM_SUP = 1 << 16, 121 + PVRDMA_PORT_SNMP_TUNNEL_SUP = 1 << 17, 122 + PVRDMA_PORT_REINIT_SUP = 1 << 18, 123 + PVRDMA_PORT_DEVICE_MGMT_SUP = 1 << 19, 124 + PVRDMA_PORT_VENDOR_CLASS_SUP = 1 << 20, 125 + PVRDMA_PORT_DR_NOTICE_SUP = 1 << 21, 126 + PVRDMA_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, 127 + PVRDMA_PORT_BOOT_MGMT_SUP = 1 << 23, 128 + PVRDMA_PORT_LINK_LATENCY_SUP = 1 << 24, 129 + PVRDMA_PORT_CLIENT_REG_SUP = 1 << 25, 130 + PVRDMA_PORT_IP_BASED_GIDS = 1 << 26, 131 + PVRDMA_PORT_CAP_FLAGS_MAX = PVRDMA_PORT_IP_BASED_GIDS, 132 + }; 133 + 134 + enum pvrdma_port_width { 135 + PVRDMA_WIDTH_1X = 1, 136 + PVRDMA_WIDTH_4X = 2, 137 + PVRDMA_WIDTH_8X = 4, 138 + PVRDMA_WIDTH_12X = 8, 139 + }; 140 + 141 + static inline int pvrdma_width_enum_to_int(enum pvrdma_port_width width) 142 + { 143 + switch (width) { 144 + case PVRDMA_WIDTH_1X: return 1; 145 + case PVRDMA_WIDTH_4X: return 4; 146 + case PVRDMA_WIDTH_8X: return 8; 147 + case PVRDMA_WIDTH_12X: return 12; 148 + default: return -1; 149 + } 150 + } 151 + 152 + enum pvrdma_port_speed { 153 + PVRDMA_SPEED_SDR = 1, 154 + PVRDMA_SPEED_DDR = 2, 155 + PVRDMA_SPEED_QDR = 4, 156 + PVRDMA_SPEED_FDR10 = 8, 157 + PVRDMA_SPEED_FDR = 16, 158 + PVRDMA_SPEED_EDR = 32, 159 + }; 160 + 161 + struct pvrdma_port_attr { 162 + enum pvrdma_port_state state; 163 + enum pvrdma_mtu max_mtu; 164 + enum pvrdma_mtu active_mtu; 165 + uint32_t gid_tbl_len; 166 + uint32_t port_cap_flags; 167 + uint32_t max_msg_sz; 168 + uint32_t bad_pkey_cntr; 169 + uint32_t 
qkey_viol_cntr; 170 + uint16_t pkey_tbl_len; 171 + uint16_t lid; 172 + uint16_t sm_lid; 173 + uint8_t lmc; 174 + uint8_t max_vl_num; 175 + uint8_t sm_sl; 176 + uint8_t subnet_timeout; 177 + uint8_t init_type_reply; 178 + uint8_t active_width; 179 + uint8_t active_speed; 180 + uint8_t phys_state; 181 + uint8_t reserved[2]; 182 + }; 183 + 184 + struct pvrdma_global_route { 185 + union pvrdma_gid dgid; 186 + uint32_t flow_label; 187 + uint8_t sgid_index; 188 + uint8_t hop_limit; 189 + uint8_t traffic_class; 190 + uint8_t reserved; 191 + }; 192 + 193 + struct pvrdma_grh { 194 + uint32_t version_tclass_flow; 195 + uint16_t paylen; 196 + uint8_t next_hdr; 197 + uint8_t hop_limit; 198 + union pvrdma_gid sgid; 199 + union pvrdma_gid dgid; 200 + }; 201 + 202 + enum pvrdma_ah_flags { 203 + PVRDMA_AH_GRH = 1, 204 + }; 205 + 206 + enum pvrdma_rate { 207 + PVRDMA_RATE_PORT_CURRENT = 0, 208 + PVRDMA_RATE_2_5_GBPS = 2, 209 + PVRDMA_RATE_5_GBPS = 5, 210 + PVRDMA_RATE_10_GBPS = 3, 211 + PVRDMA_RATE_20_GBPS = 6, 212 + PVRDMA_RATE_30_GBPS = 4, 213 + PVRDMA_RATE_40_GBPS = 7, 214 + PVRDMA_RATE_60_GBPS = 8, 215 + PVRDMA_RATE_80_GBPS = 9, 216 + PVRDMA_RATE_120_GBPS = 10, 217 + PVRDMA_RATE_14_GBPS = 11, 218 + PVRDMA_RATE_56_GBPS = 12, 219 + PVRDMA_RATE_112_GBPS = 13, 220 + PVRDMA_RATE_168_GBPS = 14, 221 + PVRDMA_RATE_25_GBPS = 15, 222 + PVRDMA_RATE_100_GBPS = 16, 223 + PVRDMA_RATE_200_GBPS = 17, 224 + PVRDMA_RATE_300_GBPS = 18, 225 + }; 226 + 227 + struct pvrdma_ah_attr { 228 + struct pvrdma_global_route grh; 229 + uint16_t dlid; 230 + uint16_t vlan_id; 231 + uint8_t sl; 232 + uint8_t src_path_bits; 233 + uint8_t static_rate; 234 + uint8_t ah_flags; 235 + uint8_t port_num; 236 + uint8_t dmac[6]; 237 + uint8_t reserved; 238 + }; 239 + 240 + enum pvrdma_cq_notify_flags { 241 + PVRDMA_CQ_SOLICITED = 1 << 0, 242 + PVRDMA_CQ_NEXT_COMP = 1 << 1, 243 + PVRDMA_CQ_SOLICITED_MASK = PVRDMA_CQ_SOLICITED | 244 + PVRDMA_CQ_NEXT_COMP, 245 + PVRDMA_CQ_REPORT_MISSED_EVENTS = 1 << 2, 246 + }; 247 + 248 + struct pvrdma_qp_cap { 249 + uint32_t max_send_wr; 250 + uint32_t max_recv_wr; 251 + uint32_t max_send_sge; 252 + uint32_t max_recv_sge; 253 + uint32_t max_inline_data; 254 + uint32_t reserved; 255 + }; 256 + 257 + enum pvrdma_sig_type { 258 + PVRDMA_SIGNAL_ALL_WR, 259 + PVRDMA_SIGNAL_REQ_WR, 260 + }; 261 + 262 + enum pvrdma_qp_type { 263 + PVRDMA_QPT_SMI, 264 + PVRDMA_QPT_GSI, 265 + PVRDMA_QPT_RC, 266 + PVRDMA_QPT_UC, 267 + PVRDMA_QPT_UD, 268 + PVRDMA_QPT_RAW_IPV6, 269 + PVRDMA_QPT_RAW_ETHERTYPE, 270 + PVRDMA_QPT_RAW_PACKET = 8, 271 + PVRDMA_QPT_XRC_INI = 9, 272 + PVRDMA_QPT_XRC_TGT, 273 + PVRDMA_QPT_MAX, 274 + }; 275 + 276 + enum pvrdma_qp_create_flags { 277 + PVRDMA_QP_CREATE_IPOPVRDMA_UD_LSO = 1 << 0, 278 + PVRDMA_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, 279 + }; 280 + 281 + enum pvrdma_qp_attr_mask { 282 + PVRDMA_QP_STATE = 1 << 0, 283 + PVRDMA_QP_CUR_STATE = 1 << 1, 284 + PVRDMA_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, 285 + PVRDMA_QP_ACCESS_FLAGS = 1 << 3, 286 + PVRDMA_QP_PKEY_INDEX = 1 << 4, 287 + PVRDMA_QP_PORT = 1 << 5, 288 + PVRDMA_QP_QKEY = 1 << 6, 289 + PVRDMA_QP_AV = 1 << 7, 290 + PVRDMA_QP_PATH_MTU = 1 << 8, 291 + PVRDMA_QP_TIMEOUT = 1 << 9, 292 + PVRDMA_QP_RETRY_CNT = 1 << 10, 293 + PVRDMA_QP_RNR_RETRY = 1 << 11, 294 + PVRDMA_QP_RQ_PSN = 1 << 12, 295 + PVRDMA_QP_MAX_QP_RD_ATOMIC = 1 << 13, 296 + PVRDMA_QP_ALT_PATH = 1 << 14, 297 + PVRDMA_QP_MIN_RNR_TIMER = 1 << 15, 298 + PVRDMA_QP_SQ_PSN = 1 << 16, 299 + PVRDMA_QP_MAX_DEST_RD_ATOMIC = 1 << 17, 300 + PVRDMA_QP_PATH_MIG_STATE = 1 << 18, 301 + PVRDMA_QP_CAP = 1 << 19, 302 
+ PVRDMA_QP_DEST_QPN = 1 << 20, 303 + PVRDMA_QP_ATTR_MASK_MAX = PVRDMA_QP_DEST_QPN, 304 + }; 305 + 306 + enum pvrdma_qp_state { 307 + PVRDMA_QPS_RESET, 308 + PVRDMA_QPS_INIT, 309 + PVRDMA_QPS_RTR, 310 + PVRDMA_QPS_RTS, 311 + PVRDMA_QPS_SQD, 312 + PVRDMA_QPS_SQE, 313 + PVRDMA_QPS_ERR, 314 + }; 315 + 316 + enum pvrdma_mig_state { 317 + PVRDMA_MIG_MIGRATED, 318 + PVRDMA_MIG_REARM, 319 + PVRDMA_MIG_ARMED, 320 + }; 321 + 322 + enum pvrdma_mw_type { 323 + PVRDMA_MW_TYPE_1 = 1, 324 + PVRDMA_MW_TYPE_2 = 2, 325 + }; 326 + 327 + struct pvrdma_srq_attr { 328 + uint32_t max_wr; 329 + uint32_t max_sge; 330 + uint32_t srq_limit; 331 + uint32_t reserved; 332 + }; 333 + 334 + struct pvrdma_qp_attr { 335 + enum pvrdma_qp_state qp_state; 336 + enum pvrdma_qp_state cur_qp_state; 337 + enum pvrdma_mtu path_mtu; 338 + enum pvrdma_mig_state path_mig_state; 339 + uint32_t qkey; 340 + uint32_t rq_psn; 341 + uint32_t sq_psn; 342 + uint32_t dest_qp_num; 343 + uint32_t qp_access_flags; 344 + uint16_t pkey_index; 345 + uint16_t alt_pkey_index; 346 + uint8_t en_sqd_async_notify; 347 + uint8_t sq_draining; 348 + uint8_t max_rd_atomic; 349 + uint8_t max_dest_rd_atomic; 350 + uint8_t min_rnr_timer; 351 + uint8_t port_num; 352 + uint8_t timeout; 353 + uint8_t retry_cnt; 354 + uint8_t rnr_retry; 355 + uint8_t alt_port_num; 356 + uint8_t alt_timeout; 357 + uint8_t reserved[5]; 358 + struct pvrdma_qp_cap cap; 359 + struct pvrdma_ah_attr ah_attr; 360 + struct pvrdma_ah_attr alt_ah_attr; 361 + }; 362 + 363 + enum pvrdma_send_flags { 364 + PVRDMA_SEND_FENCE = 1 << 0, 365 + PVRDMA_SEND_SIGNALED = 1 << 1, 366 + PVRDMA_SEND_SOLICITED = 1 << 2, 367 + PVRDMA_SEND_INLINE = 1 << 3, 368 + PVRDMA_SEND_IP_CSUM = 1 << 4, 369 + PVRDMA_SEND_FLAGS_MAX = PVRDMA_SEND_IP_CSUM, 370 + }; 371 + 372 + enum pvrdma_access_flags { 373 + PVRDMA_ACCESS_LOCAL_WRITE = 1 << 0, 374 + PVRDMA_ACCESS_REMOTE_WRITE = 1 << 1, 375 + PVRDMA_ACCESS_REMOTE_READ = 1 << 2, 376 + PVRDMA_ACCESS_REMOTE_ATOMIC = 1 << 3, 377 + PVRDMA_ACCESS_MW_BIND = 1 << 4, 378 + PVRDMA_ZERO_BASED = 1 << 5, 379 + PVRDMA_ACCESS_ON_DEMAND = 1 << 6, 380 + PVRDMA_ACCESS_FLAGS_MAX = PVRDMA_ACCESS_ON_DEMAND, 381 + }; 382 + 383 + #endif /* __PVRDMA_VERBS_H__ */
+293
include/standard-headers/rdma/vmw_pvrdma-abi.h
··· 1 + /* 2 + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of EITHER the GNU General Public License 6 + * version 2 as published by the Free Software Foundation or the BSD 7 + * 2-Clause License. This program is distributed in the hope that it 8 + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED 9 + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 10 + * See the GNU General Public License version 2 for more details at 11 + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program available in the file COPYING in the main 15 + * directory of this source tree. 16 + * 17 + * The BSD 2-Clause License 18 + * 19 + * Redistribution and use in source and binary forms, with or 20 + * without modification, are permitted provided that the following 21 + * conditions are met: 22 + * 23 + * - Redistributions of source code must retain the above 24 + * copyright notice, this list of conditions and the following 25 + * disclaimer. 26 + * 27 + * - Redistributions in binary form must reproduce the above 28 + * copyright notice, this list of conditions and the following 29 + * disclaimer in the documentation and/or other materials 30 + * provided with the distribution. 31 + * 32 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 33 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 34 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 35 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 36 + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 37 + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 39 + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 41 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 42 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 43 + * OF THE POSSIBILITY OF SUCH DAMAGE. 44 + */ 45 + 46 + #ifndef __VMW_PVRDMA_ABI_H__ 47 + #define __VMW_PVRDMA_ABI_H__ 48 + 49 + #include "standard-headers/linux/types.h" 50 + 51 + #define PVRDMA_UVERBS_ABI_VERSION 3 /* ABI Version. */ 52 + #define PVRDMA_UAR_HANDLE_MASK 0x00FFFFFF /* Bottom 24 bits. */ 53 + #define PVRDMA_UAR_QP_OFFSET 0 /* QP doorbell. */ 54 + #define PVRDMA_UAR_QP_SEND BIT(30) /* Send bit. */ 55 + #define PVRDMA_UAR_QP_RECV BIT(31) /* Recv bit. */ 56 + #define PVRDMA_UAR_CQ_OFFSET 4 /* CQ doorbell. */ 57 + #define PVRDMA_UAR_CQ_ARM_SOL BIT(29) /* Arm solicited bit. */ 58 + #define PVRDMA_UAR_CQ_ARM BIT(30) /* Arm bit. */ 59 + #define PVRDMA_UAR_CQ_POLL BIT(31) /* Poll bit. 
*/ 60 + 61 + enum pvrdma_wr_opcode { 62 + PVRDMA_WR_RDMA_WRITE, 63 + PVRDMA_WR_RDMA_WRITE_WITH_IMM, 64 + PVRDMA_WR_SEND, 65 + PVRDMA_WR_SEND_WITH_IMM, 66 + PVRDMA_WR_RDMA_READ, 67 + PVRDMA_WR_ATOMIC_CMP_AND_SWP, 68 + PVRDMA_WR_ATOMIC_FETCH_AND_ADD, 69 + PVRDMA_WR_LSO, 70 + PVRDMA_WR_SEND_WITH_INV, 71 + PVRDMA_WR_RDMA_READ_WITH_INV, 72 + PVRDMA_WR_LOCAL_INV, 73 + PVRDMA_WR_FAST_REG_MR, 74 + PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP, 75 + PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD, 76 + PVRDMA_WR_BIND_MW, 77 + PVRDMA_WR_REG_SIG_MR, 78 + }; 79 + 80 + enum pvrdma_wc_status { 81 + PVRDMA_WC_SUCCESS, 82 + PVRDMA_WC_LOC_LEN_ERR, 83 + PVRDMA_WC_LOC_QP_OP_ERR, 84 + PVRDMA_WC_LOC_EEC_OP_ERR, 85 + PVRDMA_WC_LOC_PROT_ERR, 86 + PVRDMA_WC_WR_FLUSH_ERR, 87 + PVRDMA_WC_MW_BIND_ERR, 88 + PVRDMA_WC_BAD_RESP_ERR, 89 + PVRDMA_WC_LOC_ACCESS_ERR, 90 + PVRDMA_WC_REM_INV_REQ_ERR, 91 + PVRDMA_WC_REM_ACCESS_ERR, 92 + PVRDMA_WC_REM_OP_ERR, 93 + PVRDMA_WC_RETRY_EXC_ERR, 94 + PVRDMA_WC_RNR_RETRY_EXC_ERR, 95 + PVRDMA_WC_LOC_RDD_VIOL_ERR, 96 + PVRDMA_WC_REM_INV_RD_REQ_ERR, 97 + PVRDMA_WC_REM_ABORT_ERR, 98 + PVRDMA_WC_INV_EECN_ERR, 99 + PVRDMA_WC_INV_EEC_STATE_ERR, 100 + PVRDMA_WC_FATAL_ERR, 101 + PVRDMA_WC_RESP_TIMEOUT_ERR, 102 + PVRDMA_WC_GENERAL_ERR, 103 + }; 104 + 105 + enum pvrdma_wc_opcode { 106 + PVRDMA_WC_SEND, 107 + PVRDMA_WC_RDMA_WRITE, 108 + PVRDMA_WC_RDMA_READ, 109 + PVRDMA_WC_COMP_SWAP, 110 + PVRDMA_WC_FETCH_ADD, 111 + PVRDMA_WC_BIND_MW, 112 + PVRDMA_WC_LSO, 113 + PVRDMA_WC_LOCAL_INV, 114 + PVRDMA_WC_FAST_REG_MR, 115 + PVRDMA_WC_MASKED_COMP_SWAP, 116 + PVRDMA_WC_MASKED_FETCH_ADD, 117 + PVRDMA_WC_RECV = 1 << 7, 118 + PVRDMA_WC_RECV_RDMA_WITH_IMM, 119 + }; 120 + 121 + enum pvrdma_wc_flags { 122 + PVRDMA_WC_GRH = 1 << 0, 123 + PVRDMA_WC_WITH_IMM = 1 << 1, 124 + PVRDMA_WC_WITH_INVALIDATE = 1 << 2, 125 + PVRDMA_WC_IP_CSUM_OK = 1 << 3, 126 + PVRDMA_WC_WITH_SMAC = 1 << 4, 127 + PVRDMA_WC_WITH_VLAN = 1 << 5, 128 + PVRDMA_WC_WITH_NETWORK_HDR_TYPE = 1 << 6, 129 + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, 130 + }; 131 + 132 + struct pvrdma_alloc_ucontext_resp { 133 + uint32_t qp_tab_size; 134 + uint32_t reserved; 135 + }; 136 + 137 + struct pvrdma_alloc_pd_resp { 138 + uint32_t pdn; 139 + uint32_t reserved; 140 + }; 141 + 142 + struct pvrdma_create_cq { 143 + uint64_t buf_addr; 144 + uint32_t buf_size; 145 + uint32_t reserved; 146 + }; 147 + 148 + struct pvrdma_create_cq_resp { 149 + uint32_t cqn; 150 + uint32_t reserved; 151 + }; 152 + 153 + struct pvrdma_resize_cq { 154 + uint64_t buf_addr; 155 + uint32_t buf_size; 156 + uint32_t reserved; 157 + }; 158 + 159 + struct pvrdma_create_srq { 160 + uint64_t buf_addr; 161 + uint32_t buf_size; 162 + uint32_t reserved; 163 + }; 164 + 165 + struct pvrdma_create_srq_resp { 166 + uint32_t srqn; 167 + uint32_t reserved; 168 + }; 169 + 170 + struct pvrdma_create_qp { 171 + uint64_t rbuf_addr; 172 + uint64_t sbuf_addr; 173 + uint32_t rbuf_size; 174 + uint32_t sbuf_size; 175 + uint64_t qp_addr; 176 + }; 177 + 178 + /* PVRDMA masked atomic compare and swap */ 179 + struct pvrdma_ex_cmp_swap { 180 + uint64_t swap_val; 181 + uint64_t compare_val; 182 + uint64_t swap_mask; 183 + uint64_t compare_mask; 184 + }; 185 + 186 + /* PVRDMA masked atomic fetch and add */ 187 + struct pvrdma_ex_fetch_add { 188 + uint64_t add_val; 189 + uint64_t field_boundary; 190 + }; 191 + 192 + /* PVRDMA address vector. 
*/ 193 + struct pvrdma_av { 194 + uint32_t port_pd; 195 + uint32_t sl_tclass_flowlabel; 196 + uint8_t dgid[16]; 197 + uint8_t src_path_bits; 198 + uint8_t gid_index; 199 + uint8_t stat_rate; 200 + uint8_t hop_limit; 201 + uint8_t dmac[6]; 202 + uint8_t reserved[6]; 203 + }; 204 + 205 + /* PVRDMA scatter/gather entry */ 206 + struct pvrdma_sge { 207 + uint64_t addr; 208 + uint32_t length; 209 + uint32_t lkey; 210 + }; 211 + 212 + /* PVRDMA receive queue work request */ 213 + struct pvrdma_rq_wqe_hdr { 214 + uint64_t wr_id; /* wr id */ 215 + uint32_t num_sge; /* size of s/g array */ 216 + uint32_t total_len; /* reserved */ 217 + }; 218 + /* Use pvrdma_sge (ib_sge) for receive queue s/g array elements. */ 219 + 220 + /* PVRDMA send queue work request */ 221 + struct pvrdma_sq_wqe_hdr { 222 + uint64_t wr_id; /* wr id */ 223 + uint32_t num_sge; /* size of s/g array */ 224 + uint32_t total_len; /* reserved */ 225 + uint32_t opcode; /* operation type */ 226 + uint32_t send_flags; /* wr flags */ 227 + union { 228 + uint32_t imm_data; 229 + uint32_t invalidate_rkey; 230 + } ex; 231 + uint32_t reserved; 232 + union { 233 + struct { 234 + uint64_t remote_addr; 235 + uint32_t rkey; 236 + uint8_t reserved[4]; 237 + } rdma; 238 + struct { 239 + uint64_t remote_addr; 240 + uint64_t compare_add; 241 + uint64_t swap; 242 + uint32_t rkey; 243 + uint32_t reserved; 244 + } atomic; 245 + struct { 246 + uint64_t remote_addr; 247 + uint32_t log_arg_sz; 248 + uint32_t rkey; 249 + union { 250 + struct pvrdma_ex_cmp_swap cmp_swap; 251 + struct pvrdma_ex_fetch_add fetch_add; 252 + } wr_data; 253 + } masked_atomics; 254 + struct { 255 + uint64_t iova_start; 256 + uint64_t pl_pdir_dma; 257 + uint32_t page_shift; 258 + uint32_t page_list_len; 259 + uint32_t length; 260 + uint32_t access_flags; 261 + uint32_t rkey; 262 + } fast_reg; 263 + struct { 264 + uint32_t remote_qpn; 265 + uint32_t remote_qkey; 266 + struct pvrdma_av av; 267 + } ud; 268 + } wr; 269 + }; 270 + /* Use pvrdma_sge (ib_sge) for send queue s/g array elements. */ 271 + 272 + /* Completion queue element. */ 273 + struct pvrdma_cqe { 274 + uint64_t wr_id; 275 + uint64_t qp; 276 + uint32_t opcode; 277 + uint32_t status; 278 + uint32_t byte_len; 279 + uint32_t imm_data; 280 + uint32_t src_qp; 281 + uint32_t wc_flags; 282 + uint32_t vendor_err; 283 + uint16_t pkey_index; 284 + uint16_t slid; 285 + uint8_t sl; 286 + uint8_t dlid_path_bits; 287 + uint8_t port_num; 288 + uint8_t smac[6]; 289 + uint8_t network_hdr_type; 290 + uint8_t reserved2[6]; /* Pad to next power of 2 (64). */ 291 + }; 292 + 293 + #endif /* __VMW_PVRDMA_ABI_H__ */
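The UAR bit layout at the top of this header is what the device's BAR 2 write handler decodes: the low 24 bits carry a QP or CQ handle and the top bits select the operation. A hedged sketch of the doorbell words a guest driver would write (the matching decode lives in pvrdma_main.c's UAR write handler, earlier in this series):

    /* Illustrative only: doorbell words written into the UAR page by a guest. */
    uint32_t qp_handle = 3, cq_handle = 1;

    /* kick the send queue of QP 3 (written at offset PVRDMA_UAR_QP_OFFSET) */
    uint32_t send_db = (qp_handle & PVRDMA_UAR_HANDLE_MASK) | PVRDMA_UAR_QP_SEND;

    /* re-arm completion notification for CQ 1 (offset PVRDMA_UAR_CQ_OFFSET) */
    uint32_t arm_db  = (cq_handle & PVRDMA_UAR_HANDLE_MASK) | PVRDMA_UAR_CQ_ARM;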
+1 -1
include/sysemu/hostmem.h
··· 54 54 char *id; 55 55 uint64_t size; 56 56 bool merge, dump; 57 - bool prealloc, force_prealloc, is_mapped; 57 + bool prealloc, force_prealloc, is_mapped, share; 58 58 DECLARE_BITMAP(host_nodes, MAX_NODES + 1); 59 59 HostMemPolicy policy; 60 60
+1 -1
include/sysemu/kvm.h
··· 248 248 249 249 /* interface with exec.c */ 250 250 251 - void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align)); 251 + void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared)); 252 252 253 253 /* internal API */ 254 254
+13 -3
memory.c
··· 1539 1539 uint64_t size, 1540 1540 Error **errp) 1541 1541 { 1542 + memory_region_init_ram_shared_nomigrate(mr, owner, name, size, false, errp); 1543 + } 1544 + 1545 + void memory_region_init_ram_shared_nomigrate(MemoryRegion *mr, 1546 + Object *owner, 1547 + const char *name, 1548 + uint64_t size, 1549 + bool share, 1550 + Error **errp) 1551 + { 1542 1552 memory_region_init(mr, owner, name, size); 1543 1553 mr->ram = true; 1544 1554 mr->terminates = true; 1545 1555 mr->destructor = memory_region_destructor_ram; 1546 - mr->ram_block = qemu_ram_alloc(size, mr, errp); 1556 + mr->ram_block = qemu_ram_alloc(size, share, mr, errp); 1547 1557 mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0; 1548 1558 } 1549 1559 ··· 1654 1664 mr->readonly = true; 1655 1665 mr->terminates = true; 1656 1666 mr->destructor = memory_region_destructor_ram; 1657 - mr->ram_block = qemu_ram_alloc(size, mr, errp); 1667 + mr->ram_block = qemu_ram_alloc(size, false, mr, errp); 1658 1668 mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0; 1659 1669 } 1660 1670 ··· 1673 1683 mr->terminates = true; 1674 1684 mr->rom_device = true; 1675 1685 mr->destructor = memory_region_destructor_ram; 1676 - mr->ram_block = qemu_ram_alloc(size, mr, errp); 1686 + mr->ram_block = qemu_ram_alloc(size, false, mr, errp); 1677 1687 } 1678 1688 1679 1689 void memory_region_init_iommu(void *_iommu_mr,
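Existing memory_region_init_ram_nomigrate() callers keep their behaviour: the old entry point now simply forwards to the new _shared_ variant with share set to false, and qemu_ram_alloc() gains a matching flag so the request reaches the mmap layer. A minimal sketch of how a device model could ask for a shared mapping (the device type and its fields are invented for illustration, this is not code from the series):

    /* Hypothetical realize function: back a region with MAP_SHARED RAM so a
     * co-operating host process can access it. ExampleDev is made up. */
    static void example_dev_realize(DeviceState *dev, Error **errp)
    {
        ExampleDev *d = EXAMPLE_DEV(dev);

        memory_region_init_ram_shared_nomigrate(&d->ram, OBJECT(dev),
                                                "example-dev.ram",
                                                d->ram_size,
                                                true /* share */, errp);
    }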
+9 -1
qemu-options.hx
··· 3975 3975 region is marked as private to QEMU, or shared. The latter allows 3976 3976 a co-operating external process to access the QEMU memory region. 3977 3977 3978 + The @option{share} is also required for pvrdma devices due to 3979 + limitations in the RDMA API provided by Linux. 3980 + 3981 + Setting share=on might affect the ability to configure NUMA 3982 + bindings for the memory backend under some circumstances, see 3983 + Documentation/vm/numa_memory_policy.txt on the Linux kernel 3984 + source tree for additional details. 3985 + 3978 3986 Setting the @option{discard-data} boolean option to @var{on} 3979 3987 indicates that file contents can be destroyed when QEMU exits, 3980 3988 to avoid unnecessarily flushing data to the backing file. Note ··· 4017 4025 the device DAX /dev/dax0.0 requires 2M alignment rather than 4K. In 4018 4026 such cases, users can specify the required alignment via this option. 4019 4027 4020 - @item -object memory-backend-ram,id=@var{id},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},size=@var{size},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} 4028 + @item -object memory-backend-ram,id=@var{id},merge=@var{on|off},dump=@var{on|off},share=@var{on|off},prealloc=@var{on|off},size=@var{size},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} 4021 4029 4022 4030 Creates a memory backend object, which can be used to back the guest RAM. 4023 4031 Memory backend objects offer more control than the @option{-m} option that is
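In practice this means a pvrdma guest needs its RAM backed by a share=on memory backend. A hedged example invocation (ids and sizes are placeholders, and the pvrdma device's own backend options are omitted here; they are described in the docs/pvrdma.txt added by this series):

    qemu-system-x86_64 -m 2G \
        -object memory-backend-ram,id=mb1,size=2G,share=on \
        -numa node,memdev=mb1 \
        -device pvrdma

The share flag turns the allocation from a mapping private to QEMU into a shared one, which is the property the pvrdma backend relies on; the plumbing that carries the flag down to mmap is the qemu_anon_ram_alloc() change further below.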
+30
scripts/update-linux-headers.sh
··· 38 38 -e 'linux/if_ether' \ 39 39 -e 'input-event-codes' \ 40 40 -e 'sys/' \ 41 + -e 'pvrdma_verbs' \ 41 42 > /dev/null 42 43 then 43 44 echo "Unexpected #include in input file $f". ··· 46 47 47 48 header=$(basename "$f"); 48 49 sed -e 's/__u\([0-9][0-9]*\)/uint\1_t/g' \ 50 + -e 's/u\([0-9][0-9]*\)/uint\1_t/g' \ 49 51 -e 's/__s\([0-9][0-9]*\)/int\1_t/g' \ 50 52 -e 's/__le\([0-9][0-9]*\)/uint\1_t/g' \ 51 53 -e 's/__be\([0-9][0-9]*\)/uint\1_t/g' \ ··· 56 58 -e 's/__inline__/inline/' \ 57 59 -e '/sys\/ioctl.h/d' \ 58 60 -e 's/SW_MAX/SW_MAX_/' \ 61 + -e 's/atomic_t/int/' \ 59 62 "$f" > "$to/$header"; 60 63 } 61 64 ··· 145 148 "$tmpdir/include/linux/input-event-codes.h" \ 146 149 "$tmpdir/include/linux/pci_regs.h"; do 147 150 cp_portable "$i" "$output/include/standard-headers/linux" 151 + done 152 + 153 + rm -rf "$output/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma" 154 + mkdir -p "$output/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma" 155 + 156 + # Remove the unused functions from pvrdma_verbs.h avoiding the unnecessary 157 + # import of several infiniband/networking/other headers 158 + tmp_pvrdma_verbs="$tmpdir/pvrdma_verbs.h" 159 + # Parse the entire file instead of single lines to match 160 + # function declarations expanding over multiple lines 161 + # and strip the declarations starting with pvrdma prefix. 162 + sed -e '1h;2,$H;$!d;g' -e 's/[^};]*pvrdma[^(| ]*([^)]*);//g' \ 163 + "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h" > \ 164 + "$tmp_pvrdma_verbs"; 165 + 166 + for i in "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h" \ 167 + "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h" \ 168 + "$tmp_pvrdma_verbs"; do \ 169 + cp_portable "$i" \ 170 + "$output/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/" 171 + done 172 + 173 + rm -rf "$output/include/standard-headers/rdma/" 174 + mkdir -p "$output/include/standard-headers/rdma/" 175 + for i in "$tmpdir/include/rdma/vmw_pvrdma-abi.h"; do 176 + cp_portable "$i" \ 177 + "$output/include/standard-headers/rdma/" 148 178 done 149 179 150 180 cat <<EOF >$output/include/standard-headers/linux/types.h
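The sed invocation above is the interesting part: the '1h;2,$H;$!d;g' idiom slurps all of pvrdma_verbs.h into the pattern space so the following substitution can delete function prototypes even when they span several lines. A toy illustration (the prototype shown is invented, not taken from the real header):

    printf 'int x;\nstruct pvrdma_cq *pvrdma_create_cq(struct ib_device *dev,\n int entries);\n' \
        | sed -e '1h;2,$H;$!d;g' -e 's/[^};]*pvrdma[^(| ]*([^)]*);//g'
    # prints only "int x;" - the two-line pvrdma_create_cq() declaration is stripped

Stripping the pvrdma-prefixed declarations this way keeps the imported copy of the header free of prototypes that would otherwise drag in a pile of infiniband and networking kernel headers.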
+2 -2
target/s390x/kvm.c
··· 144 144 145 145 static int active_cmma; 146 146 147 - static void *legacy_s390_alloc(size_t size, uint64_t *align); 147 + static void *legacy_s390_alloc(size_t size, uint64_t *align, bool shared); 148 148 149 149 static int kvm_s390_query_mem_limit(uint64_t *memory_limit) 150 150 { ··· 752 752 * to grow. We also have to use MAP parameters that avoid 753 753 * read-only mapping of guest pages. 754 754 */ 755 - static void *legacy_s390_alloc(size_t size, uint64_t *align) 755 + static void *legacy_s390_alloc(size_t size, uint64_t *align, bool shared) 756 756 { 757 757 void *mem; 758 758
+2 -2
util/oslib-posix.c
··· 127 127 } 128 128 129 129 /* alloc shared memory pages */ 130 - void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment) 130 + void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared) 131 131 { 132 132 size_t align = QEMU_VMALLOC_ALIGN; 133 - void *ptr = qemu_ram_mmap(-1, size, align, false); 133 + void *ptr = qemu_ram_mmap(-1, size, align, shared); 134 134 135 135 if (ptr == MAP_FAILED) { 136 136 return NULL;
+1 -1
util/oslib-win32.c
··· 67 67 return qemu_oom_check(qemu_try_memalign(alignment, size)); 68 68 } 69 69 70 - void *qemu_anon_ram_alloc(size_t size, uint64_t *align) 70 + void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared) 71 71 { 72 72 void *ptr; 73 73