qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

9p: Added virtfs option 'multidevs=remap|forbid|warn'

'warn' (default): Only log an error message (once) on host if more than one
device is shared by same export, except of that just ignore this config
error though. This is the default behaviour for not breaking existing
installations implying that they really know what they are doing.

'forbid': Like 'warn', but except of just logging an error this
also denies access of guest to additional devices.

'remap': Allows to share more than one device per export by remapping
inodes from host to guest appropriately. To support multiple devices on the
9p share, and avoid qid path collisions we take the device id as input to
generate a unique QID path. The lowest 48 bits of the path will be set
equal to the file inode, and the top bits will be uniquely assigned based
on the top 16 bits of the inode and the device id.

Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
[CS: - Rebased to https://github.com/gkurz/qemu/commits/9p-next
(SHA1 7fc4c49e91).
- Added virtfs option 'multidevs', original patch simply did the inode
remapping without being asked.
- Updated hash calls to new xxhash API.
- Updated docs for new option 'multidevs'.
- Fixed v9fs_do_readdir() not having remapped inodes.
- Log error message when running out of prefixes in
qid_path_prefixmap().
- Fixed definition of QPATH_INO_MASK.
- Wrapped qpp_table initialization to dedicated qpp_table_init()
function.
- Dropped unnecessary parantheses in qpp_lookup_func().
- Dropped unnecessary g_malloc0() result checks. ]
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
[groug: - Moved "multidevs" parsing to the local backend.
- Added hint to invalid multidevs option error.
- Turn "remap" into "x-remap". ]
Signed-off-by: Greg Kurz <groug@kaod.org>

authored by

Antonios Motakis and committed by
Greg Kurz
1a6ed33c 3b5ee9e8

+242 -25
+5
fsdev/file-op-9p.h
··· 59 59 #define V9FS_RDONLY 0x00000040 60 60 #define V9FS_PROXY_SOCK_FD 0x00000080 61 61 #define V9FS_PROXY_SOCK_NAME 0x00000100 62 + /* 63 + * multidevs option (either one of the two applies exclusively) 64 + */ 65 + #define V9FS_REMAP_INODES 0x00000200 66 + #define V9FS_FORBID_MULTIDEVS 0x00000400 62 67 63 68 #define V9FS_SEC_MASK 0x0000003C 64 69
+6 -1
fsdev/qemu-fsdev-opts.c
··· 31 31 }, { 32 32 .name = "readonly", 33 33 .type = QEMU_OPT_BOOL, 34 - 34 + }, { 35 + .name = "multidevs", 36 + .type = QEMU_OPT_STRING, 35 37 }, { 36 38 .name = "socket", 37 39 .type = QEMU_OPT_STRING, ··· 75 77 }, { 76 78 .name = "readonly", 77 79 .type = QEMU_OPT_BOOL, 80 + }, { 81 + .name = "multidevs", 82 + .type = QEMU_OPT_STRING, 78 83 }, { 79 84 .name = "socket", 80 85 .type = QEMU_OPT_STRING,
+1
fsdev/qemu-fsdev.c
··· 58 58 "writeout", 59 59 "fmode", 60 60 "dmode", 61 + "multidevs", 61 62 "throttling.bps-total", 62 63 "throttling.bps-read", 63 64 "throttling.bps-write",
+21
hw/9pfs/9p-local.c
··· 1483 1483 { 1484 1484 const char *sec_model = qemu_opt_get(opts, "security_model"); 1485 1485 const char *path = qemu_opt_get(opts, "path"); 1486 + const char *multidevs = qemu_opt_get(opts, "multidevs"); 1486 1487 Error *local_err = NULL; 1487 1488 1488 1489 if (!sec_model) { ··· 1504 1505 error_setg(errp, "invalid security_model property '%s'", sec_model); 1505 1506 error_append_security_model_hint(errp); 1506 1507 return -1; 1508 + } 1509 + 1510 + if (multidevs) { 1511 + if (!strcmp(multidevs, "remap")) { 1512 + fse->export_flags &= ~V9FS_FORBID_MULTIDEVS; 1513 + fse->export_flags |= V9FS_REMAP_INODES; 1514 + } else if (!strcmp(multidevs, "forbid")) { 1515 + fse->export_flags &= ~V9FS_REMAP_INODES; 1516 + fse->export_flags |= V9FS_FORBID_MULTIDEVS; 1517 + } else if (!strcmp(multidevs, "warn")) { 1518 + fse->export_flags &= ~V9FS_FORBID_MULTIDEVS; 1519 + fse->export_flags &= ~V9FS_REMAP_INODES; 1520 + } else { 1521 + error_setg(&local_err, "invalid multidevs property '%s'", 1522 + multidevs); 1523 + error_append_hint(&local_err, "Valid options are: multidevs=" 1524 + "[remap|forbid|warn]\n"); 1525 + error_propagate(errp, local_err); 1526 + return -1; 1527 + } 1507 1528 } 1508 1529 1509 1530 if (!path) {
+167 -21
hw/9pfs/9p.c
··· 26 26 #include "trace.h" 27 27 #include "migration/blocker.h" 28 28 #include "sysemu/qtest.h" 29 + #include "qemu/xxhash.h" 29 30 30 31 int open_fd_hw; 31 32 int total_open_fd; ··· 572 573 P9_STAT_MODE_NAMED_PIPE | \ 573 574 P9_STAT_MODE_SOCKET) 574 575 575 - /* This is the algorithm from ufs in spfs */ 576 + /* creative abuse of tb_hash_func7, which is based on xxhash */ 577 + static uint32_t qpp_hash(QppEntry e) 578 + { 579 + return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0); 580 + } 581 + 582 + static bool qpp_lookup_func(const void *obj, const void *userp) 583 + { 584 + const QppEntry *e1 = obj, *e2 = userp; 585 + return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix; 586 + } 587 + 588 + static void qpp_table_remove(void *p, uint32_t h, void *up) 589 + { 590 + g_free(p); 591 + } 592 + 593 + static void qpp_table_destroy(struct qht *ht) 594 + { 595 + if (!ht || !ht->map) { 596 + return; 597 + } 598 + qht_iter(ht, qpp_table_remove, NULL); 599 + qht_destroy(ht); 600 + } 601 + 602 + static void qpp_table_init(struct qht *ht) 603 + { 604 + qht_init(ht, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE); 605 + } 606 + 607 + /* 608 + * stat_to_qid needs to map inode number (64 bits) and device id (32 bits) 609 + * to a unique QID path (64 bits). To avoid having to map and keep track 610 + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus 611 + * the device id to the 16 highest bits of the QID path. The 48 lowest bits 612 + * of the QID path equal to the lowest bits of the inode number. 613 + * 614 + * This takes advantage of the fact that inode number are usually not 615 + * random but allocated sequentially, so we have fewer items to keep 616 + * track of. 617 + */ 618 + static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf, 619 + uint64_t *path) 620 + { 621 + QppEntry lookup = { 622 + .dev = stbuf->st_dev, 623 + .ino_prefix = (uint16_t) (stbuf->st_ino >> 48) 624 + }, *val; 625 + uint32_t hash = qpp_hash(lookup); 626 + 627 + val = qht_lookup(&pdu->s->qpp_table, &lookup, hash); 628 + 629 + if (!val) { 630 + if (pdu->s->qp_prefix_next == 0) { 631 + /* we ran out of prefixes */ 632 + error_report_once( 633 + "9p: No more prefixes available for remapping inodes from " 634 + "host to guest." 635 + ); 636 + return -ENFILE; 637 + } 638 + 639 + val = g_malloc0(sizeof(QppEntry)); 640 + *val = lookup; 641 + 642 + /* new unique inode prefix and device combo */ 643 + val->qp_prefix = pdu->s->qp_prefix_next++; 644 + qht_insert(&pdu->s->qpp_table, val, hash, NULL); 645 + } 646 + 647 + *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK); 648 + return 0; 649 + } 650 + 576 651 static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp) 577 652 { 653 + int err; 578 654 size_t size; 579 655 580 - if (pdu->s->dev_id != stbuf->st_dev) { 581 - warn_report_once( 582 - "9p: Multiple devices detected in same VirtFS export, " 583 - "which might lead to file ID collisions and severe " 584 - "misbehaviours on guest! You should use a separate " 585 - "export for each device shared from host." 586 - ); 656 + if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) { 657 + /* map inode+device to qid path (fast path) */ 658 + err = qid_path_prefixmap(pdu, stbuf, &qidp->path); 659 + if (err) { 660 + return err; 661 + } 662 + } else { 663 + if (pdu->s->dev_id != stbuf->st_dev) { 664 + if (pdu->s->ctx.export_flags & V9FS_FORBID_MULTIDEVS) { 665 + error_report_once( 666 + "9p: Multiple devices detected in same VirtFS export. " 667 + "Access of guest to additional devices is (partly) " 668 + "denied due to virtfs option 'multidevs=forbid' being " 669 + "effective." 670 + ); 671 + return -ENODEV; 672 + } else { 673 + warn_report_once( 674 + "9p: Multiple devices detected in same VirtFS export, " 675 + "which might lead to file ID collisions and severe " 676 + "misbehaviours on guest! You should either use a " 677 + "separate export for each device shared from host or " 678 + "use virtfs option 'multidevs=remap'!" 679 + ); 680 + } 681 + } 682 + memset(&qidp->path, 0, sizeof(qidp->path)); 683 + size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path)); 684 + memcpy(&qidp->path, &stbuf->st_ino, size); 587 685 } 588 686 589 - memset(&qidp->path, 0, sizeof(qidp->path)); 590 - size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path)); 591 - memcpy(&qidp->path, &stbuf->st_ino, size); 592 687 qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8); 593 688 qidp->type = 0; 594 689 if (S_ISDIR(stbuf->st_mode)) { ··· 616 711 return err; 617 712 } 618 713 return 0; 714 + } 715 + 716 + static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp, 717 + struct dirent *dent, V9fsQID *qidp) 718 + { 719 + struct stat stbuf; 720 + V9fsPath path; 721 + int err; 722 + 723 + v9fs_path_init(&path); 724 + 725 + err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path); 726 + if (err < 0) { 727 + goto out; 728 + } 729 + err = v9fs_co_lstat(pdu, &path, &stbuf); 730 + if (err < 0) { 731 + goto out; 732 + } 733 + err = stat_to_qid(pdu, &stbuf, qidp); 734 + 735 + out: 736 + v9fs_path_free(&path); 737 + return err; 619 738 } 620 739 621 740 V9fsPDU *pdu_alloc(V9fsState *s) ··· 1966 2085 v9fs_string_free(&name); 1967 2086 return count; 1968 2087 } 1969 - /* 1970 - * Fill up just the path field of qid because the client uses 1971 - * only that. To fill the entire qid structure we will have 1972 - * to stat each dirent found, which is expensive 1973 - */ 1974 - size = MIN(sizeof(dent->d_ino), sizeof(qid.path)); 1975 - memcpy(&qid.path, &dent->d_ino, size); 1976 - /* Fill the other fields with dummy values */ 1977 - qid.type = 0; 1978 - qid.version = 0; 2088 + 2089 + if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) { 2090 + /* 2091 + * dirent_to_qid() implies expensive stat call for each entry, 2092 + * we must do that here though since inode remapping requires 2093 + * the device id, which in turn might be different for 2094 + * different entries; we cannot make any assumption to avoid 2095 + * that here. 2096 + */ 2097 + err = dirent_to_qid(pdu, fidp, dent, &qid); 2098 + if (err < 0) { 2099 + v9fs_readdir_unlock(&fidp->fs.dir); 2100 + v9fs_co_seekdir(pdu, fidp, saved_dir_pos); 2101 + v9fs_string_free(&name); 2102 + return err; 2103 + } 2104 + } else { 2105 + /* 2106 + * Fill up just the path field of qid because the client uses 2107 + * only that. To fill the entire qid structure we will have 2108 + * to stat each dirent found, which is expensive. For the 2109 + * latter reason we don't call dirent_to_qid() here. Only drawback 2110 + * is that no multi-device export detection of stat_to_qid() 2111 + * would be done and provided as error to the user here. But 2112 + * user would get that error anyway when accessing those 2113 + * files/dirs through other ways. 2114 + */ 2115 + size = MIN(sizeof(dent->d_ino), sizeof(qid.path)); 2116 + memcpy(&qid.path, &dent->d_ino, size); 2117 + /* Fill the other fields with dummy values */ 2118 + qid.type = 0; 2119 + qid.version = 0; 2120 + } 1979 2121 1980 2122 /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */ 1981 2123 len = pdu_marshal(pdu, 11 + count, "Qqbs", ··· 3676 3818 3677 3819 s->dev_id = stat.st_dev; 3678 3820 3821 + qpp_table_init(&s->qpp_table); 3822 + s->qp_prefix_next = 1; /* reserve 0 to detect overflow */ 3823 + 3679 3824 s->ctx.fst = &fse->fst; 3680 3825 fsdev_throttle_init(s->ctx.fst); 3681 3826 ··· 3697 3842 fsdev_throttle_cleanup(s->ctx.fst); 3698 3843 } 3699 3844 g_free(s->tag); 3845 + qpp_table_destroy(&s->qpp_table); 3700 3846 g_free(s->ctx.fs_root); 3701 3847 } 3702 3848
+12
hw/9pfs/9p.h
··· 8 8 #include "fsdev/9p-iov-marshal.h" 9 9 #include "qemu/thread.h" 10 10 #include "qemu/coroutine.h" 11 + #include "qemu/qht.h" 11 12 12 13 enum { 13 14 P9_TLERROR = 6, ··· 235 236 V9fsFidState *rclm_lst; 236 237 }; 237 238 239 + #define QPATH_INO_MASK ((1ULL << 48) - 1) 240 + 241 + /* QID path prefix entry, see stat_to_qid */ 242 + typedef struct { 243 + dev_t dev; 244 + uint16_t ino_prefix; 245 + uint16_t qp_prefix; 246 + } QppEntry; 247 + 238 248 struct V9fsState 239 249 { 240 250 QLIST_HEAD(, V9fsPDU) free_list; ··· 257 267 V9fsConf fsconf; 258 268 V9fsQID root_qid; 259 269 dev_t dev_id; 270 + struct qht qpp_table; 271 + uint16_t qp_prefix_next; 260 272 }; 261 273 262 274 /* 9p2000.L open flags */
+24 -2
qemu-options.hx
··· 1339 1339 1340 1340 DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs, 1341 1341 "-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n" 1342 - " [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n" 1342 + " [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,multidevs=remap|forbid|warn]\n" 1343 1343 "-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n" 1344 1344 "-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n" 1345 1345 "-virtfs synth,mount_tag=tag[,id=id][,readonly]\n", ··· 1347 1347 1348 1348 STEXI 1349 1349 1350 - @item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}] 1350 + @item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,multidevs=@var{multidevs}] 1351 1351 @itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly] 1352 1352 @itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly] 1353 1353 @itemx -virtfs synth,mount_tag=@var{mount_tag} ··· 1403 1403 only with security models "mapped-xattr" and "mapped-file". 1404 1404 @item mount_tag=@var{mount_tag} 1405 1405 Specifies the tag name to be used by the guest to mount this export point. 1406 + @item multidevs=@var{multidevs} 1407 + Specifies how to deal with multiple devices being shared with a 9p export. 1408 + Supported behaviours are either "remap", "forbid" or "warn". The latter is 1409 + the default behaviour on which virtfs 9p expects only one device to be 1410 + shared with the same export, and if more than one device is shared and 1411 + accessed via the same 9p export then only a warning message is logged 1412 + (once) by qemu on host side. In order to avoid file ID collisions on guest 1413 + you should either create a separate virtfs export for each device to be 1414 + shared with guests (recommended way) or you might use "remap" instead which 1415 + allows you to share multiple devices with only one export instead, which is 1416 + achieved by remapping the original inode numbers from host to guest in a 1417 + way that would prevent such collisions. Remapping inodes in such use cases 1418 + is required because the original device IDs from host are never passed and 1419 + exposed on guest. Instead all files of an export shared with virtfs always 1420 + share the same device id on guest. So two files with identical inode 1421 + numbers but from actually different devices on host would otherwise cause a 1422 + file ID collision and hence potential misbehaviours on guest. "forbid" on 1423 + the other hand assumes like "warn" that only one device is shared by the 1424 + same export, however it will not only log a warning message but also 1425 + deny access to additional devices on guest. Note though that "forbid" does 1426 + currently not block all possible file access operations (e.g. readdir() 1427 + would still return entries from other devices). 1406 1428 @end table 1407 1429 ETEXI 1408 1430
+6 -1
vl.c
··· 3335 3335 case QEMU_OPTION_virtfs: { 3336 3336 QemuOpts *fsdev; 3337 3337 QemuOpts *device; 3338 - const char *writeout, *sock_fd, *socket, *path, *security_model; 3338 + const char *writeout, *sock_fd, *socket, *path, *security_model, 3339 + *multidevs; 3339 3340 3340 3341 olist = qemu_find_opts("virtfs"); 3341 3342 if (!olist) { ··· 3395 3396 qemu_opt_set_bool(fsdev, "readonly", 3396 3397 qemu_opt_get_bool(opts, "readonly", 0), 3397 3398 &error_abort); 3399 + multidevs = qemu_opt_get(opts, "multidevs"); 3400 + if (multidevs) { 3401 + qemu_opt_set(fsdev, "multidevs", multidevs, &error_abort); 3402 + } 3398 3403 device = qemu_opts_create(qemu_find_opts("device"), NULL, 0, 3399 3404 &error_abort); 3400 3405 qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);