qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

virtiofsd: Fix xattr operations

Current virtiofsd has problems with xattr operations:
they do not work properly for directories/symlinks/special files.

The fundamental cause is that virtiofsd uses openat() + f...xattr()
system calls for xattr operations, but we should not open symlinks/special
files in the daemon. Therefore the functionality is restricted.

Fix this problem by:
1. during setup of each thread, call unshare(CLONE_FS)
2. in xattr operations (e.g. lo_getxattr), if the inode is not a regular
file or directory, use fchdir(proc_self_fd) + ...xattr() +
fchdir(root.fd) instead of openat() + f...xattr()

(Note: for a regular file/directory, openat() + f...xattr()
is still used for performance reasons)

With this patch, xfstests generic/062 passes on virtiofs.

This fix is suggested by Miklos Szeredi and Stefan Hajnoczi.
The original discussion can be found here:
https://www.redhat.com/archives/virtio-fs/2019-October/msg00046.html

Signed-off-by: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Message-Id: <20200227055927.24566-3-misono.tomohiro@jp.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

authored by

Misono Tomohiro and committed by
Dr. David Alan Gilbert
bdfd6678 16e15a73

+77 -47
+13
tools/virtiofsd/fuse_virtio.c
··· 426 426 return ret; 427 427 } 428 428 429 + static __thread bool clone_fs_called; 430 + 429 431 /* Process one FVRequest in a thread pool */ 430 432 static void fv_queue_worker(gpointer data, gpointer user_data) 431 433 { ··· 440 442 struct fuse_bufvec *pbufv; 441 443 442 444 assert(se->bufsize > sizeof(struct fuse_in_header)); 445 + 446 + if (!clone_fs_called) { 447 + int ret; 448 + 449 + /* unshare FS for xattr operation */ 450 + ret = unshare(CLONE_FS); 451 + /* should not fail */ 452 + assert(ret == 0); 453 + 454 + clone_fs_called = true; 455 + } 443 456 444 457 /* 445 458 * An element contains one request and the space to send our response
+58 -47
tools/virtiofsd/passthrough_ll.c
··· 123 123 pthread_mutex_t plock_mutex; 124 124 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ 125 125 126 - bool is_symlink; 126 + mode_t filetype; 127 127 }; 128 128 129 129 struct lo_cred { ··· 695 695 struct lo_inode *parent; 696 696 char path[PATH_MAX]; 697 697 698 - if (inode->is_symlink) { 698 + if (S_ISLNK(inode->filetype)) { 699 699 res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH); 700 700 if (res == -1 && errno == EINVAL) { 701 701 /* Sorry, no race free way to set times on symlink. */ ··· 928 928 goto out_err; 929 929 } 930 930 931 - inode->is_symlink = S_ISLNK(e->attr.st_mode); 931 + /* cache only filetype */ 932 + inode->filetype = (e->attr.st_mode & S_IFMT); 932 933 933 934 /* 934 935 * One for the caller and one for nlookup (released in ··· 1135 1136 struct lo_inode *parent; 1136 1137 char path[PATH_MAX]; 1137 1138 1138 - if (inode->is_symlink) { 1139 + if (S_ISLNK(inode->filetype)) { 1139 1140 res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); 1140 1141 if (res == -1 && (errno == ENOENT || errno == EINVAL)) { 1141 1142 /* Sorry, no race free way to hard-link a symlink. */ ··· 2189 2190 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", 2190 2191 ino, name, size); 2191 2192 2192 - if (inode->is_symlink) { 2193 - /* Sorry, no race free way to getxattr on symlink. */ 2194 - saverr = EPERM; 2195 - goto out; 2196 - } 2197 - 2198 2193 if (size) { 2199 2194 value = malloc(size); 2200 2195 if (!value) { ··· 2203 2198 } 2204 2199 2205 2200 sprintf(procname, "%i", inode->fd); 2206 - fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2207 - if (fd < 0) { 2208 - goto out_err; 2201 + /* 2202 + * It is not safe to open() non-regular/non-dir files in file server 2203 + * unless O_PATH is used, so use that method for regular files/dir 2204 + * only (as it seems giving less performance overhead). 2205 + * Otherwise, call fchdir() to avoid open(). 
2206 + */ 2207 + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2208 + fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2209 + if (fd < 0) { 2210 + goto out_err; 2211 + } 2212 + ret = fgetxattr(fd, name, value, size); 2213 + } else { 2214 + /* fchdir should not fail here */ 2215 + assert(fchdir(lo->proc_self_fd) == 0); 2216 + ret = getxattr(procname, name, value, size); 2217 + assert(fchdir(lo->root.fd) == 0); 2209 2218 } 2210 2219 2211 - ret = fgetxattr(fd, name, value, size); 2212 2220 if (ret == -1) { 2213 2221 goto out_err; 2214 2222 } ··· 2262 2270 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, 2263 2271 size); 2264 2272 2265 - if (inode->is_symlink) { 2266 - /* Sorry, no race free way to listxattr on symlink. */ 2267 - saverr = EPERM; 2268 - goto out; 2269 - } 2270 - 2271 2273 if (size) { 2272 2274 value = malloc(size); 2273 2275 if (!value) { ··· 2276 2278 } 2277 2279 2278 2280 sprintf(procname, "%i", inode->fd); 2279 - fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2280 - if (fd < 0) { 2281 - goto out_err; 2281 + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2282 + fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2283 + if (fd < 0) { 2284 + goto out_err; 2285 + } 2286 + ret = flistxattr(fd, value, size); 2287 + } else { 2288 + /* fchdir should not fail here */ 2289 + assert(fchdir(lo->proc_self_fd) == 0); 2290 + ret = listxattr(procname, value, size); 2291 + assert(fchdir(lo->root.fd) == 0); 2282 2292 } 2283 2293 2284 - ret = flistxattr(fd, value, size); 2285 2294 if (ret == -1) { 2286 2295 goto out_err; 2287 2296 } ··· 2335 2344 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 2336 2345 ", name=%s value=%s size=%zd)\n", ino, name, value, size); 2337 2346 2338 - if (inode->is_symlink) { 2339 - /* Sorry, no race free way to setxattr on symlink. 
*/ 2340 - saverr = EPERM; 2341 - goto out; 2342 - } 2343 - 2344 2347 sprintf(procname, "%i", inode->fd); 2345 - fd = openat(lo->proc_self_fd, procname, O_RDWR); 2346 - if (fd < 0) { 2347 - saverr = errno; 2348 - goto out; 2348 + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2349 + fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2350 + if (fd < 0) { 2351 + saverr = errno; 2352 + goto out; 2353 + } 2354 + ret = fsetxattr(fd, name, value, size, flags); 2355 + } else { 2356 + /* fchdir should not fail here */ 2357 + assert(fchdir(lo->proc_self_fd) == 0); 2358 + ret = setxattr(procname, name, value, size, flags); 2359 + assert(fchdir(lo->root.fd) == 0); 2349 2360 } 2350 2361 2351 - ret = fsetxattr(fd, name, value, size, flags); 2352 2362 saverr = ret == -1 ? errno : 0; 2353 2363 2354 2364 out: ··· 2383 2393 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, 2384 2394 name); 2385 2395 2386 - if (inode->is_symlink) { 2387 - /* Sorry, no race free way to setxattr on symlink. */ 2388 - saverr = EPERM; 2389 - goto out; 2390 - } 2391 - 2392 2396 sprintf(procname, "%i", inode->fd); 2393 - fd = openat(lo->proc_self_fd, procname, O_RDWR); 2394 - if (fd < 0) { 2395 - saverr = errno; 2396 - goto out; 2397 + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2398 + fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2399 + if (fd < 0) { 2400 + saverr = errno; 2401 + goto out; 2402 + } 2403 + ret = fremovexattr(fd, name); 2404 + } else { 2405 + /* fchdir should not fail here */ 2406 + assert(fchdir(lo->proc_self_fd) == 0); 2407 + ret = removexattr(procname, name); 2408 + assert(fchdir(lo->root.fd) == 0); 2397 2409 } 2398 2410 2399 - ret = fremovexattr(fd, name); 2400 2411 saverr = ret == -1 ? 
errno : 0; 2401 2412 2402 2413 out: ··· 2796 2807 exit(1); 2797 2808 } 2798 2809 2799 - root->is_symlink = false; 2810 + root->filetype = S_IFDIR; 2800 2811 root->fd = fd; 2801 2812 root->key.ino = stat.st_ino; 2802 2813 root->key.dev = stat.st_dev;
+6
tools/virtiofsd/seccomp.c
··· 41 41 SCMP_SYS(exit), 42 42 SCMP_SYS(exit_group), 43 43 SCMP_SYS(fallocate), 44 + SCMP_SYS(fchdir), 44 45 SCMP_SYS(fchmodat), 45 46 SCMP_SYS(fchownat), 46 47 SCMP_SYS(fcntl), ··· 62 63 SCMP_SYS(getpid), 63 64 SCMP_SYS(gettid), 64 65 SCMP_SYS(gettimeofday), 66 + SCMP_SYS(getxattr), 65 67 SCMP_SYS(linkat), 68 + SCMP_SYS(listxattr), 66 69 SCMP_SYS(lseek), 67 70 SCMP_SYS(madvise), 68 71 SCMP_SYS(mkdirat), ··· 85 88 SCMP_SYS(recvmsg), 86 89 SCMP_SYS(renameat), 87 90 SCMP_SYS(renameat2), 91 + SCMP_SYS(removexattr), 88 92 SCMP_SYS(rt_sigaction), 89 93 SCMP_SYS(rt_sigprocmask), 90 94 SCMP_SYS(rt_sigreturn), ··· 98 102 SCMP_SYS(setresuid32), 99 103 #endif 100 104 SCMP_SYS(set_robust_list), 105 + SCMP_SYS(setxattr), 101 106 SCMP_SYS(symlinkat), 102 107 SCMP_SYS(time), /* Rarely needed, except on static builds */ 103 108 SCMP_SYS(tgkill), 104 109 SCMP_SYS(unlinkat), 110 + SCMP_SYS(unshare), 105 111 SCMP_SYS(utimensat), 106 112 SCMP_SYS(write), 107 113 SCMP_SYS(writev),