/*
 * Provenance (web viewer banner, kept for reference):
 *   qemu with hax to log dma reads & writes -- jcs.org/2018/11/12/vfio
 *   at master, 2969 lines, 74 kB (raw view)
 */
/*
 * FUSE: Filesystem in Userspace
 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
 *
 * This program can be distributed under the terms of the GNU GPLv2.
 * See the file COPYING.
 */

/*
 *
 * This file system mirrors the existing file system hierarchy of the
 * system, starting at the root file system. This is implemented by
 * just "passing through" all requests to the corresponding user-space
 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
 * this implementation uses the low-level API. Its performance should
 * be the least bad among the three, but many operations are not
 * implemented. In particular, it is not possible to remove files (or
 * directories) because the code necessary to defer actual removal
 * until the file is not opened anymore would make the example much
 * more complicated.
 *
 * When writeback caching is enabled (-o writeback mount option), it
 * is only possible to write to files for which the mounting user has
 * read permissions. This is because the writeback cache requires the
 * kernel to be able to issue read requests for all files (which the
 * passthrough filesystem cannot satisfy if it can't read the file in
 * the underlying filesystem).
28 * 29 * Compile with: 30 * 31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o 32 * passthrough_ll 33 * 34 * ## Source code ## 35 * \include passthrough_ll.c 36 */ 37 38#include "qemu/osdep.h" 39#include "qemu/timer.h" 40#include "fuse_virtio.h" 41#include "fuse_log.h" 42#include "fuse_lowlevel.h" 43#include <assert.h> 44#include <cap-ng.h> 45#include <dirent.h> 46#include <errno.h> 47#include <glib.h> 48#include <inttypes.h> 49#include <limits.h> 50#include <pthread.h> 51#include <stdbool.h> 52#include <stddef.h> 53#include <stdio.h> 54#include <stdlib.h> 55#include <string.h> 56#include <sys/file.h> 57#include <sys/mount.h> 58#include <sys/prctl.h> 59#include <sys/resource.h> 60#include <sys/syscall.h> 61#include <sys/types.h> 62#include <sys/wait.h> 63#include <sys/xattr.h> 64#include <syslog.h> 65#include <unistd.h> 66 67#include "passthrough_helpers.h" 68#include "seccomp.h" 69 70/* Keep track of inode posix locks for each owner. */ 71struct lo_inode_plock { 72 uint64_t lock_owner; 73 int fd; /* fd for OFD locks */ 74}; 75 76struct lo_map_elem { 77 union { 78 struct lo_inode *inode; 79 struct lo_dirp *dirp; 80 int fd; 81 ssize_t freelist; 82 }; 83 bool in_use; 84}; 85 86/* Maps FUSE fh or ino values to internal objects */ 87struct lo_map { 88 struct lo_map_elem *elems; 89 size_t nelems; 90 ssize_t freelist; 91}; 92 93struct lo_key { 94 ino_t ino; 95 dev_t dev; 96}; 97 98struct lo_inode { 99 int fd; 100 101 /* 102 * Atomic reference count for this object. The nlookup field holds a 103 * reference and release it when nlookup reaches 0. 104 */ 105 gint refcount; 106 107 struct lo_key key; 108 109 /* 110 * This counter keeps the inode alive during the FUSE session. 111 * Incremented when the FUSE inode number is sent in a reply 112 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is 113 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc. 
114 * 115 * Note that this value is untrusted because the client can manipulate 116 * it arbitrarily using FUSE_FORGET requests. 117 * 118 * Protected by lo->mutex. 119 */ 120 uint64_t nlookup; 121 122 fuse_ino_t fuse_ino; 123 pthread_mutex_t plock_mutex; 124 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ 125 126 mode_t filetype; 127}; 128 129struct lo_cred { 130 uid_t euid; 131 gid_t egid; 132}; 133 134enum { 135 CACHE_NONE, 136 CACHE_AUTO, 137 CACHE_ALWAYS, 138}; 139 140struct lo_data { 141 pthread_mutex_t mutex; 142 int debug; 143 int writeback; 144 int flock; 145 int posix_lock; 146 int xattr; 147 char *source; 148 char *modcaps; 149 double timeout; 150 int cache; 151 int timeout_set; 152 int readdirplus_set; 153 int readdirplus_clear; 154 struct lo_inode root; 155 GHashTable *inodes; /* protected by lo->mutex */ 156 struct lo_map ino_map; /* protected by lo->mutex */ 157 struct lo_map dirp_map; /* protected by lo->mutex */ 158 struct lo_map fd_map; /* protected by lo->mutex */ 159 160 /* An O_PATH file descriptor to /proc/self/fd/ */ 161 int proc_self_fd; 162}; 163 164static const struct fuse_opt lo_opts[] = { 165 { "writeback", offsetof(struct lo_data, writeback), 1 }, 166 { "no_writeback", offsetof(struct lo_data, writeback), 0 }, 167 { "source=%s", offsetof(struct lo_data, source), 0 }, 168 { "flock", offsetof(struct lo_data, flock), 1 }, 169 { "no_flock", offsetof(struct lo_data, flock), 0 }, 170 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 }, 171 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 }, 172 { "xattr", offsetof(struct lo_data, xattr), 1 }, 173 { "no_xattr", offsetof(struct lo_data, xattr), 0 }, 174 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 }, 175 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, 176 { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, 177 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE }, 178 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, 
179 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, 180 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, 181 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, 182 FUSE_OPT_END 183}; 184static bool use_syslog = false; 185static int current_log_level; 186static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, 187 uint64_t n); 188 189static struct { 190 pthread_mutex_t mutex; 191 void *saved; 192} cap; 193/* That we loaded cap-ng in the current thread from the saved */ 194static __thread bool cap_loaded = 0; 195 196static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); 197 198static int is_dot_or_dotdot(const char *name) 199{ 200 return name[0] == '.' && 201 (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); 202} 203 204/* Is `path` a single path component that is not "." or ".."? */ 205static int is_safe_path_component(const char *path) 206{ 207 if (strchr(path, '/')) { 208 return 0; 209 } 210 211 return !is_dot_or_dotdot(path); 212} 213 214static struct lo_data *lo_data(fuse_req_t req) 215{ 216 return (struct lo_data *)fuse_req_userdata(req); 217} 218 219/* 220 * Load capng's state from our saved state if the current thread 221 * hadn't previously been loaded. 222 * returns 0 on success 223 */ 224static int load_capng(void) 225{ 226 if (!cap_loaded) { 227 pthread_mutex_lock(&cap.mutex); 228 capng_restore_state(&cap.saved); 229 /* 230 * restore_state free's the saved copy 231 * so make another. 232 */ 233 cap.saved = capng_save_state(); 234 if (!cap.saved) { 235 pthread_mutex_unlock(&cap.mutex); 236 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); 237 return -EINVAL; 238 } 239 pthread_mutex_unlock(&cap.mutex); 240 241 /* 242 * We want to use the loaded state for our pid, 243 * not the original 244 */ 245 capng_setpid(syscall(SYS_gettid)); 246 cap_loaded = true; 247 } 248 return 0; 249} 250 251/* 252 * Helpers for dropping and regaining effective capabilities. 
Returns 0 253 * on success, error otherwise 254 */ 255static int drop_effective_cap(const char *cap_name, bool *cap_dropped) 256{ 257 int cap, ret; 258 259 cap = capng_name_to_capability(cap_name); 260 if (cap < 0) { 261 ret = errno; 262 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", 263 cap_name, strerror(errno)); 264 goto out; 265 } 266 267 if (load_capng()) { 268 ret = errno; 269 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); 270 goto out; 271 } 272 273 /* We dont have this capability in effective set already. */ 274 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) { 275 ret = 0; 276 goto out; 277 } 278 279 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) { 280 ret = errno; 281 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n"); 282 goto out; 283 } 284 285 if (capng_apply(CAPNG_SELECT_CAPS)) { 286 ret = errno; 287 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n"); 288 goto out; 289 } 290 291 ret = 0; 292 if (cap_dropped) { 293 *cap_dropped = true; 294 } 295 296out: 297 return ret; 298} 299 300static int gain_effective_cap(const char *cap_name) 301{ 302 int cap; 303 int ret = 0; 304 305 cap = capng_name_to_capability(cap_name); 306 if (cap < 0) { 307 ret = errno; 308 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", 309 cap_name, strerror(errno)); 310 goto out; 311 } 312 313 if (load_capng()) { 314 ret = errno; 315 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); 316 goto out; 317 } 318 319 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) { 320 ret = errno; 321 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n"); 322 goto out; 323 } 324 325 if (capng_apply(CAPNG_SELECT_CAPS)) { 326 ret = errno; 327 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n"); 328 goto out; 329 } 330 ret = 0; 331 332out: 333 return ret; 334} 335 336static void lo_map_init(struct lo_map *map) 337{ 338 map->elems = NULL; 339 map->nelems = 0; 340 map->freelist = -1; 341} 342 343static void lo_map_destroy(struct lo_map *map) 344{ 345 
free(map->elems); 346} 347 348static int lo_map_grow(struct lo_map *map, size_t new_nelems) 349{ 350 struct lo_map_elem *new_elems; 351 size_t i; 352 353 if (new_nelems <= map->nelems) { 354 return 1; 355 } 356 357 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems); 358 if (!new_elems) { 359 return 0; 360 } 361 362 for (i = map->nelems; i < new_nelems; i++) { 363 new_elems[i].freelist = i + 1; 364 new_elems[i].in_use = false; 365 } 366 new_elems[new_nelems - 1].freelist = -1; 367 368 map->elems = new_elems; 369 map->freelist = map->nelems; 370 map->nelems = new_nelems; 371 return 1; 372} 373 374static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map) 375{ 376 struct lo_map_elem *elem; 377 378 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) { 379 return NULL; 380 } 381 382 elem = &map->elems[map->freelist]; 383 map->freelist = elem->freelist; 384 385 elem->in_use = true; 386 387 return elem; 388} 389 390static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key) 391{ 392 ssize_t *prev; 393 394 if (!lo_map_grow(map, key + 1)) { 395 return NULL; 396 } 397 398 for (prev = &map->freelist; *prev != -1; 399 prev = &map->elems[*prev].freelist) { 400 if (*prev == key) { 401 struct lo_map_elem *elem = &map->elems[key]; 402 403 *prev = elem->freelist; 404 elem->in_use = true; 405 return elem; 406 } 407 } 408 return NULL; 409} 410 411static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key) 412{ 413 if (key >= map->nelems) { 414 return NULL; 415 } 416 if (!map->elems[key].in_use) { 417 return NULL; 418 } 419 return &map->elems[key]; 420} 421 422static void lo_map_remove(struct lo_map *map, size_t key) 423{ 424 struct lo_map_elem *elem; 425 426 if (key >= map->nelems) { 427 return; 428 } 429 430 elem = &map->elems[key]; 431 if (!elem->in_use) { 432 return; 433 } 434 435 elem->in_use = false; 436 437 elem->freelist = map->freelist; 438 map->freelist = key; 439} 440 441/* Assumes lo->mutex is held */ 442static 
ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) 443{ 444 struct lo_map_elem *elem; 445 446 elem = lo_map_alloc_elem(&lo_data(req)->fd_map); 447 if (!elem) { 448 return -1; 449 } 450 451 elem->fd = fd; 452 return elem - lo_data(req)->fd_map.elems; 453} 454 455/* Assumes lo->mutex is held */ 456static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) 457{ 458 struct lo_map_elem *elem; 459 460 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map); 461 if (!elem) { 462 return -1; 463 } 464 465 elem->dirp = dirp; 466 return elem - lo_data(req)->dirp_map.elems; 467} 468 469/* Assumes lo->mutex is held */ 470static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) 471{ 472 struct lo_map_elem *elem; 473 474 elem = lo_map_alloc_elem(&lo_data(req)->ino_map); 475 if (!elem) { 476 return -1; 477 } 478 479 elem->inode = inode; 480 return elem - lo_data(req)->ino_map.elems; 481} 482 483static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep) 484{ 485 struct lo_inode *inode = *inodep; 486 487 if (!inode) { 488 return; 489 } 490 491 *inodep = NULL; 492 493 if (g_atomic_int_dec_and_test(&inode->refcount)) { 494 close(inode->fd); 495 free(inode); 496 } 497} 498 499/* Caller must release refcount using lo_inode_put() */ 500static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) 501{ 502 struct lo_data *lo = lo_data(req); 503 struct lo_map_elem *elem; 504 505 pthread_mutex_lock(&lo->mutex); 506 elem = lo_map_get(&lo->ino_map, ino); 507 if (elem) { 508 g_atomic_int_inc(&elem->inode->refcount); 509 } 510 pthread_mutex_unlock(&lo->mutex); 511 512 if (!elem) { 513 return NULL; 514 } 515 516 return elem->inode; 517} 518 519/* 520 * TODO Remove this helper and force callers to hold an inode refcount until 521 * they are done with the fd. This will be done in a later patch to make 522 * review easier. 
523 */ 524static int lo_fd(fuse_req_t req, fuse_ino_t ino) 525{ 526 struct lo_inode *inode = lo_inode(req, ino); 527 int fd; 528 529 if (!inode) { 530 return -1; 531 } 532 533 fd = inode->fd; 534 lo_inode_put(lo_data(req), &inode); 535 return fd; 536} 537 538static void lo_init(void *userdata, struct fuse_conn_info *conn) 539{ 540 struct lo_data *lo = (struct lo_data *)userdata; 541 542 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) { 543 conn->want |= FUSE_CAP_EXPORT_SUPPORT; 544 } 545 546 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { 547 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); 548 conn->want |= FUSE_CAP_WRITEBACK_CACHE; 549 } 550 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) { 551 if (lo->flock) { 552 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); 553 conn->want |= FUSE_CAP_FLOCK_LOCKS; 554 } else { 555 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n"); 556 conn->want &= ~FUSE_CAP_FLOCK_LOCKS; 557 } 558 } 559 560 if (conn->capable & FUSE_CAP_POSIX_LOCKS) { 561 if (lo->posix_lock) { 562 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n"); 563 conn->want |= FUSE_CAP_POSIX_LOCKS; 564 } else { 565 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n"); 566 conn->want &= ~FUSE_CAP_POSIX_LOCKS; 567 } 568 } 569 570 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || 571 lo->readdirplus_clear) { 572 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); 573 conn->want &= ~FUSE_CAP_READDIRPLUS; 574 } 575} 576 577static void lo_getattr(fuse_req_t req, fuse_ino_t ino, 578 struct fuse_file_info *fi) 579{ 580 int res; 581 struct stat buf; 582 struct lo_data *lo = lo_data(req); 583 584 (void)fi; 585 586 res = 587 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 588 if (res == -1) { 589 return (void)fuse_reply_err(req, errno); 590 } 591 592 fuse_reply_attr(req, &buf, lo->timeout); 593} 594 595static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) 596{ 597 
struct lo_data *lo = lo_data(req); 598 struct lo_map_elem *elem; 599 600 pthread_mutex_lock(&lo->mutex); 601 elem = lo_map_get(&lo->fd_map, fi->fh); 602 pthread_mutex_unlock(&lo->mutex); 603 604 if (!elem) { 605 return -1; 606 } 607 608 return elem->fd; 609} 610 611static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, 612 int valid, struct fuse_file_info *fi) 613{ 614 int saverr; 615 char procname[64]; 616 struct lo_data *lo = lo_data(req); 617 struct lo_inode *inode; 618 int ifd; 619 int res; 620 int fd; 621 622 inode = lo_inode(req, ino); 623 if (!inode) { 624 fuse_reply_err(req, EBADF); 625 return; 626 } 627 628 ifd = inode->fd; 629 630 /* If fi->fh is invalid we'll report EBADF later */ 631 if (fi) { 632 fd = lo_fi_fd(req, fi); 633 } 634 635 if (valid & FUSE_SET_ATTR_MODE) { 636 if (fi) { 637 res = fchmod(fd, attr->st_mode); 638 } else { 639 sprintf(procname, "%i", ifd); 640 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); 641 } 642 if (res == -1) { 643 goto out_err; 644 } 645 } 646 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { 647 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1; 648 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? 
attr->st_gid : (gid_t)-1; 649 650 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 651 if (res == -1) { 652 goto out_err; 653 } 654 } 655 if (valid & FUSE_SET_ATTR_SIZE) { 656 int truncfd; 657 658 if (fi) { 659 truncfd = fd; 660 } else { 661 sprintf(procname, "%i", ifd); 662 truncfd = openat(lo->proc_self_fd, procname, O_RDWR); 663 if (truncfd < 0) { 664 goto out_err; 665 } 666 } 667 668 res = ftruncate(truncfd, attr->st_size); 669 if (!fi) { 670 saverr = errno; 671 close(truncfd); 672 errno = saverr; 673 } 674 if (res == -1) { 675 goto out_err; 676 } 677 } 678 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { 679 struct timespec tv[2]; 680 681 tv[0].tv_sec = 0; 682 tv[1].tv_sec = 0; 683 tv[0].tv_nsec = UTIME_OMIT; 684 tv[1].tv_nsec = UTIME_OMIT; 685 686 if (valid & FUSE_SET_ATTR_ATIME_NOW) { 687 tv[0].tv_nsec = UTIME_NOW; 688 } else if (valid & FUSE_SET_ATTR_ATIME) { 689 tv[0] = attr->st_atim; 690 } 691 692 if (valid & FUSE_SET_ATTR_MTIME_NOW) { 693 tv[1].tv_nsec = UTIME_NOW; 694 } else if (valid & FUSE_SET_ATTR_MTIME) { 695 tv[1] = attr->st_mtim; 696 } 697 698 if (fi) { 699 res = futimens(fd, tv); 700 } else { 701 sprintf(procname, "%i", inode->fd); 702 res = utimensat(lo->proc_self_fd, procname, tv, 0); 703 } 704 if (res == -1) { 705 goto out_err; 706 } 707 } 708 lo_inode_put(lo, &inode); 709 710 return lo_getattr(req, ino, fi); 711 712out_err: 713 saverr = errno; 714 lo_inode_put(lo, &inode); 715 fuse_reply_err(req, saverr); 716} 717 718static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) 719{ 720 struct lo_inode *p; 721 struct lo_key key = { 722 .ino = st->st_ino, 723 .dev = st->st_dev, 724 }; 725 726 pthread_mutex_lock(&lo->mutex); 727 p = g_hash_table_lookup(lo->inodes, &key); 728 if (p) { 729 assert(p->nlookup > 0); 730 p->nlookup++; 731 g_atomic_int_inc(&p->refcount); 732 } 733 pthread_mutex_unlock(&lo->mutex); 734 735 return p; 736} 737 738/* value_destroy_func for posix_locks GHashTable */ 739static void 
posix_locks_value_destroy(gpointer data) 740{ 741 struct lo_inode_plock *plock = data; 742 743 /* 744 * We had used open() for locks and had only one fd. So 745 * closing this fd should release all OFD locks. 746 */ 747 close(plock->fd); 748 free(plock); 749} 750 751/* 752 * Increments nlookup and caller must release refcount using 753 * lo_inode_put(&parent). 754 */ 755static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, 756 struct fuse_entry_param *e) 757{ 758 int newfd; 759 int res; 760 int saverr; 761 struct lo_data *lo = lo_data(req); 762 struct lo_inode *inode = NULL; 763 struct lo_inode *dir = lo_inode(req, parent); 764 765 /* 766 * name_to_handle_at() and open_by_handle_at() can reach here with fuse 767 * mount point in guest, but we don't have its inode info in the 768 * ino_map. 769 */ 770 if (!dir) { 771 return ENOENT; 772 } 773 774 memset(e, 0, sizeof(*e)); 775 e->attr_timeout = lo->timeout; 776 e->entry_timeout = lo->timeout; 777 778 /* Do not allow escaping root directory */ 779 if (dir == &lo->root && strcmp(name, "..") == 0) { 780 name = "."; 781 } 782 783 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); 784 if (newfd == -1) { 785 goto out_err; 786 } 787 788 res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 789 if (res == -1) { 790 goto out_err; 791 } 792 793 inode = lo_find(lo, &e->attr); 794 if (inode) { 795 close(newfd); 796 } else { 797 inode = calloc(1, sizeof(struct lo_inode)); 798 if (!inode) { 799 goto out_err; 800 } 801 802 /* cache only filetype */ 803 inode->filetype = (e->attr.st_mode & S_IFMT); 804 805 /* 806 * One for the caller and one for nlookup (released in 807 * unref_inode_lolocked()) 808 */ 809 g_atomic_int_set(&inode->refcount, 2); 810 811 inode->nlookup = 1; 812 inode->fd = newfd; 813 inode->key.ino = e->attr.st_ino; 814 inode->key.dev = e->attr.st_dev; 815 pthread_mutex_init(&inode->plock_mutex, NULL); 816 inode->posix_locks = g_hash_table_new_full( 817 g_direct_hash, 
g_direct_equal, NULL, posix_locks_value_destroy); 818 819 pthread_mutex_lock(&lo->mutex); 820 inode->fuse_ino = lo_add_inode_mapping(req, inode); 821 g_hash_table_insert(lo->inodes, &inode->key, inode); 822 pthread_mutex_unlock(&lo->mutex); 823 } 824 e->ino = inode->fuse_ino; 825 lo_inode_put(lo, &inode); 826 lo_inode_put(lo, &dir); 827 828 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, 829 name, (unsigned long long)e->ino); 830 831 return 0; 832 833out_err: 834 saverr = errno; 835 if (newfd != -1) { 836 close(newfd); 837 } 838 lo_inode_put(lo, &inode); 839 lo_inode_put(lo, &dir); 840 return saverr; 841} 842 843static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) 844{ 845 struct fuse_entry_param e; 846 int err; 847 848 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent, 849 name); 850 851 /* 852 * Don't use is_safe_path_component(), allow "." and ".." for NFS export 853 * support. 854 */ 855 if (strchr(name, '/')) { 856 fuse_reply_err(req, EINVAL); 857 return; 858 } 859 860 err = lo_do_lookup(req, parent, name, &e); 861 if (err) { 862 fuse_reply_err(req, err); 863 } else { 864 fuse_reply_entry(req, &e); 865 } 866} 867 868/* 869 * On some archs, setres*id is limited to 2^16 but they 870 * provide setres*id32 variants that allow 2^32. 871 * Others just let setres*id do 2^32 anyway. 872 */ 873#ifdef SYS_setresgid32 874#define OURSYS_setresgid SYS_setresgid32 875#else 876#define OURSYS_setresgid SYS_setresgid 877#endif 878 879#ifdef SYS_setresuid32 880#define OURSYS_setresuid SYS_setresuid32 881#else 882#define OURSYS_setresuid SYS_setresuid 883#endif 884 885/* 886 * Change to uid/gid of caller so that file is created with 887 * ownership of caller. 888 * TODO: What about selinux context? 
889 */ 890static int lo_change_cred(fuse_req_t req, struct lo_cred *old) 891{ 892 int res; 893 894 old->euid = geteuid(); 895 old->egid = getegid(); 896 897 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1); 898 if (res == -1) { 899 return errno; 900 } 901 902 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1); 903 if (res == -1) { 904 int errno_save = errno; 905 906 syscall(OURSYS_setresgid, -1, old->egid, -1); 907 return errno_save; 908 } 909 910 return 0; 911} 912 913/* Regain Privileges */ 914static void lo_restore_cred(struct lo_cred *old) 915{ 916 int res; 917 918 res = syscall(OURSYS_setresuid, -1, old->euid, -1); 919 if (res == -1) { 920 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid); 921 exit(1); 922 } 923 924 res = syscall(OURSYS_setresgid, -1, old->egid, -1); 925 if (res == -1) { 926 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); 927 exit(1); 928 } 929} 930 931static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, 932 const char *name, mode_t mode, dev_t rdev, 933 const char *link) 934{ 935 int res; 936 int saverr; 937 struct lo_data *lo = lo_data(req); 938 struct lo_inode *dir; 939 struct fuse_entry_param e; 940 struct lo_cred old = {}; 941 942 if (!is_safe_path_component(name)) { 943 fuse_reply_err(req, EINVAL); 944 return; 945 } 946 947 dir = lo_inode(req, parent); 948 if (!dir) { 949 fuse_reply_err(req, EBADF); 950 return; 951 } 952 953 saverr = lo_change_cred(req, &old); 954 if (saverr) { 955 goto out; 956 } 957 958 res = mknod_wrapper(dir->fd, name, link, mode, rdev); 959 960 saverr = errno; 961 962 lo_restore_cred(&old); 963 964 if (res == -1) { 965 goto out; 966 } 967 968 saverr = lo_do_lookup(req, parent, name, &e); 969 if (saverr) { 970 goto out; 971 } 972 973 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, 974 name, (unsigned long long)e.ino); 975 976 fuse_reply_entry(req, &e); 977 lo_inode_put(lo, &dir); 978 return; 979 980out: 981 lo_inode_put(lo, &dir); 982 
fuse_reply_err(req, saverr); 983} 984 985static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, 986 mode_t mode, dev_t rdev) 987{ 988 lo_mknod_symlink(req, parent, name, mode, rdev, NULL); 989} 990 991static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, 992 mode_t mode) 993{ 994 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); 995} 996 997static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, 998 const char *name) 999{ 1000 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); 1001} 1002 1003static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, 1004 const char *name) 1005{ 1006 int res; 1007 struct lo_data *lo = lo_data(req); 1008 struct lo_inode *parent_inode; 1009 struct lo_inode *inode; 1010 struct fuse_entry_param e; 1011 char procname[64]; 1012 int saverr; 1013 1014 if (!is_safe_path_component(name)) { 1015 fuse_reply_err(req, EINVAL); 1016 return; 1017 } 1018 1019 parent_inode = lo_inode(req, parent); 1020 inode = lo_inode(req, ino); 1021 if (!parent_inode || !inode) { 1022 errno = EBADF; 1023 goto out_err; 1024 } 1025 1026 memset(&e, 0, sizeof(struct fuse_entry_param)); 1027 e.attr_timeout = lo->timeout; 1028 e.entry_timeout = lo->timeout; 1029 1030 sprintf(procname, "%i", inode->fd); 1031 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name, 1032 AT_SYMLINK_FOLLOW); 1033 if (res == -1) { 1034 goto out_err; 1035 } 1036 1037 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 1038 if (res == -1) { 1039 goto out_err; 1040 } 1041 1042 pthread_mutex_lock(&lo->mutex); 1043 inode->nlookup++; 1044 pthread_mutex_unlock(&lo->mutex); 1045 e.ino = inode->fuse_ino; 1046 1047 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, 1048 name, (unsigned long long)e.ino); 1049 1050 fuse_reply_entry(req, &e); 1051 lo_inode_put(lo, &parent_inode); 1052 lo_inode_put(lo, &inode); 1053 return; 1054 1055out_err: 1056 saverr = errno; 
1057 lo_inode_put(lo, &parent_inode); 1058 lo_inode_put(lo, &inode); 1059 fuse_reply_err(req, saverr); 1060} 1061 1062/* Increments nlookup and caller must release refcount using lo_inode_put() */ 1063static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, 1064 const char *name) 1065{ 1066 int res; 1067 struct stat attr; 1068 1069 res = fstatat(lo_fd(req, parent), name, &attr, 1070 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 1071 if (res == -1) { 1072 return NULL; 1073 } 1074 1075 return lo_find(lo_data(req), &attr); 1076} 1077 1078static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) 1079{ 1080 int res; 1081 struct lo_inode *inode; 1082 struct lo_data *lo = lo_data(req); 1083 1084 if (!is_safe_path_component(name)) { 1085 fuse_reply_err(req, EINVAL); 1086 return; 1087 } 1088 1089 inode = lookup_name(req, parent, name); 1090 if (!inode) { 1091 fuse_reply_err(req, EIO); 1092 return; 1093 } 1094 1095 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); 1096 1097 fuse_reply_err(req, res == -1 ? 
errno : 0); 1098 unref_inode_lolocked(lo, inode, 1); 1099 lo_inode_put(lo, &inode); 1100} 1101 1102static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, 1103 fuse_ino_t newparent, const char *newname, 1104 unsigned int flags) 1105{ 1106 int res; 1107 struct lo_inode *parent_inode; 1108 struct lo_inode *newparent_inode; 1109 struct lo_inode *oldinode = NULL; 1110 struct lo_inode *newinode = NULL; 1111 struct lo_data *lo = lo_data(req); 1112 1113 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { 1114 fuse_reply_err(req, EINVAL); 1115 return; 1116 } 1117 1118 parent_inode = lo_inode(req, parent); 1119 newparent_inode = lo_inode(req, newparent); 1120 if (!parent_inode || !newparent_inode) { 1121 fuse_reply_err(req, EBADF); 1122 goto out; 1123 } 1124 1125 oldinode = lookup_name(req, parent, name); 1126 newinode = lookup_name(req, newparent, newname); 1127 1128 if (!oldinode) { 1129 fuse_reply_err(req, EIO); 1130 goto out; 1131 } 1132 1133 if (flags) { 1134#ifndef SYS_renameat2 1135 fuse_reply_err(req, EINVAL); 1136#else 1137 res = syscall(SYS_renameat2, parent_inode->fd, name, 1138 newparent_inode->fd, newname, flags); 1139 if (res == -1 && errno == ENOSYS) { 1140 fuse_reply_err(req, EINVAL); 1141 } else { 1142 fuse_reply_err(req, res == -1 ? errno : 0); 1143 } 1144#endif 1145 goto out; 1146 } 1147 1148 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname); 1149 1150 fuse_reply_err(req, res == -1 ? 
errno : 0); 1151out: 1152 unref_inode_lolocked(lo, oldinode, 1); 1153 unref_inode_lolocked(lo, newinode, 1); 1154 lo_inode_put(lo, &oldinode); 1155 lo_inode_put(lo, &newinode); 1156 lo_inode_put(lo, &parent_inode); 1157 lo_inode_put(lo, &newparent_inode); 1158} 1159 1160static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) 1161{ 1162 int res; 1163 struct lo_inode *inode; 1164 struct lo_data *lo = lo_data(req); 1165 1166 if (!is_safe_path_component(name)) { 1167 fuse_reply_err(req, EINVAL); 1168 return; 1169 } 1170 1171 inode = lookup_name(req, parent, name); 1172 if (!inode) { 1173 fuse_reply_err(req, EIO); 1174 return; 1175 } 1176 1177 res = unlinkat(lo_fd(req, parent), name, 0); 1178 1179 fuse_reply_err(req, res == -1 ? errno : 0); 1180 unref_inode_lolocked(lo, inode, 1); 1181 lo_inode_put(lo, &inode); 1182} 1183 1184/* To be called with lo->mutex held */ 1185static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) 1186{ 1187 if (!inode) { 1188 return; 1189 } 1190 1191 assert(inode->nlookup >= n); 1192 inode->nlookup -= n; 1193 if (!inode->nlookup) { 1194 lo_map_remove(&lo->ino_map, inode->fuse_ino); 1195 g_hash_table_remove(lo->inodes, &inode->key); 1196 if (g_hash_table_size(inode->posix_locks)) { 1197 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n"); 1198 } 1199 g_hash_table_destroy(inode->posix_locks); 1200 pthread_mutex_destroy(&inode->plock_mutex); 1201 1202 /* Drop our refcount from lo_do_lookup() */ 1203 lo_inode_put(lo, &inode); 1204 } 1205} 1206 1207static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, 1208 uint64_t n) 1209{ 1210 if (!inode) { 1211 return; 1212 } 1213 1214 pthread_mutex_lock(&lo->mutex); 1215 unref_inode(lo, inode, n); 1216 pthread_mutex_unlock(&lo->mutex); 1217} 1218 1219static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) 1220{ 1221 struct lo_data *lo = lo_data(req); 1222 struct lo_inode *inode; 1223 1224 inode = lo_inode(req, ino); 1225 
if (!inode) { 1226 return; 1227 } 1228 1229 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", 1230 (unsigned long long)ino, (unsigned long long)inode->nlookup, 1231 (unsigned long long)nlookup); 1232 1233 unref_inode_lolocked(lo, inode, nlookup); 1234 lo_inode_put(lo, &inode); 1235} 1236 1237static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) 1238{ 1239 lo_forget_one(req, ino, nlookup); 1240 fuse_reply_none(req); 1241} 1242 1243static void lo_forget_multi(fuse_req_t req, size_t count, 1244 struct fuse_forget_data *forgets) 1245{ 1246 int i; 1247 1248 for (i = 0; i < count; i++) { 1249 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); 1250 } 1251 fuse_reply_none(req); 1252} 1253 1254static void lo_readlink(fuse_req_t req, fuse_ino_t ino) 1255{ 1256 char buf[PATH_MAX + 1]; 1257 int res; 1258 1259 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); 1260 if (res == -1) { 1261 return (void)fuse_reply_err(req, errno); 1262 } 1263 1264 if (res == sizeof(buf)) { 1265 return (void)fuse_reply_err(req, ENAMETOOLONG); 1266 } 1267 1268 buf[res] = '\0'; 1269 1270 fuse_reply_readlink(req, buf); 1271} 1272 1273struct lo_dirp { 1274 gint refcount; 1275 DIR *dp; 1276 struct dirent *entry; 1277 off_t offset; 1278}; 1279 1280static void lo_dirp_put(struct lo_dirp **dp) 1281{ 1282 struct lo_dirp *d = *dp; 1283 1284 if (!d) { 1285 return; 1286 } 1287 *dp = NULL; 1288 1289 if (g_atomic_int_dec_and_test(&d->refcount)) { 1290 closedir(d->dp); 1291 free(d); 1292 } 1293} 1294 1295/* Call lo_dirp_put() on the return value when no longer needed */ 1296static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) 1297{ 1298 struct lo_data *lo = lo_data(req); 1299 struct lo_map_elem *elem; 1300 1301 pthread_mutex_lock(&lo->mutex); 1302 elem = lo_map_get(&lo->dirp_map, fi->fh); 1303 if (elem) { 1304 g_atomic_int_inc(&elem->dirp->refcount); 1305 } 1306 pthread_mutex_unlock(&lo->mutex); 1307 if (!elem) { 1308 return NULL; 1309 } 1310 1311 return 
elem->dirp; 1312} 1313 1314static void lo_opendir(fuse_req_t req, fuse_ino_t ino, 1315 struct fuse_file_info *fi) 1316{ 1317 int error = ENOMEM; 1318 struct lo_data *lo = lo_data(req); 1319 struct lo_dirp *d; 1320 int fd; 1321 ssize_t fh; 1322 1323 d = calloc(1, sizeof(struct lo_dirp)); 1324 if (d == NULL) { 1325 goto out_err; 1326 } 1327 1328 fd = openat(lo_fd(req, ino), ".", O_RDONLY); 1329 if (fd == -1) { 1330 goto out_errno; 1331 } 1332 1333 d->dp = fdopendir(fd); 1334 if (d->dp == NULL) { 1335 goto out_errno; 1336 } 1337 1338 d->offset = 0; 1339 d->entry = NULL; 1340 1341 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */ 1342 pthread_mutex_lock(&lo->mutex); 1343 fh = lo_add_dirp_mapping(req, d); 1344 pthread_mutex_unlock(&lo->mutex); 1345 if (fh == -1) { 1346 goto out_err; 1347 } 1348 1349 fi->fh = fh; 1350 if (lo->cache == CACHE_ALWAYS) { 1351 fi->cache_readdir = 1; 1352 } 1353 fuse_reply_open(req, fi); 1354 return; 1355 1356out_errno: 1357 error = errno; 1358out_err: 1359 if (d) { 1360 if (d->dp) { 1361 closedir(d->dp); 1362 } else if (fd != -1) { 1363 close(fd); 1364 } 1365 free(d); 1366 } 1367 fuse_reply_err(req, error); 1368} 1369 1370static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, 1371 off_t offset, struct fuse_file_info *fi, int plus) 1372{ 1373 struct lo_data *lo = lo_data(req); 1374 struct lo_dirp *d = NULL; 1375 struct lo_inode *dinode; 1376 char *buf = NULL; 1377 char *p; 1378 size_t rem = size; 1379 int err = EBADF; 1380 1381 dinode = lo_inode(req, ino); 1382 if (!dinode) { 1383 goto error; 1384 } 1385 1386 d = lo_dirp(req, fi); 1387 if (!d) { 1388 goto error; 1389 } 1390 1391 err = ENOMEM; 1392 buf = calloc(1, size); 1393 if (!buf) { 1394 goto error; 1395 } 1396 p = buf; 1397 1398 if (offset != d->offset) { 1399 seekdir(d->dp, offset); 1400 d->entry = NULL; 1401 d->offset = offset; 1402 } 1403 while (1) { 1404 size_t entsize; 1405 off_t nextoff; 1406 const char *name; 1407 1408 if (!d->entry) { 1409 errno 
= 0; 1410 d->entry = readdir(d->dp); 1411 if (!d->entry) { 1412 if (errno) { /* Error */ 1413 err = errno; 1414 goto error; 1415 } else { /* End of stream */ 1416 break; 1417 } 1418 } 1419 } 1420 nextoff = d->entry->d_off; 1421 name = d->entry->d_name; 1422 1423 fuse_ino_t entry_ino = 0; 1424 struct fuse_entry_param e = (struct fuse_entry_param){ 1425 .attr.st_ino = d->entry->d_ino, 1426 .attr.st_mode = d->entry->d_type << 12, 1427 }; 1428 1429 /* Hide root's parent directory */ 1430 if (dinode == &lo->root && strcmp(name, "..") == 0) { 1431 e.attr.st_ino = lo->root.key.ino; 1432 e.attr.st_mode = DT_DIR << 12; 1433 } 1434 1435 if (plus) { 1436 if (!is_dot_or_dotdot(name)) { 1437 err = lo_do_lookup(req, ino, name, &e); 1438 if (err) { 1439 goto error; 1440 } 1441 entry_ino = e.ino; 1442 } 1443 1444 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); 1445 } else { 1446 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff); 1447 } 1448 if (entsize > rem) { 1449 if (entry_ino != 0) { 1450 lo_forget_one(req, entry_ino, 1); 1451 } 1452 break; 1453 } 1454 1455 p += entsize; 1456 rem -= entsize; 1457 1458 d->entry = NULL; 1459 d->offset = nextoff; 1460 } 1461 1462 err = 0; 1463error: 1464 lo_dirp_put(&d); 1465 lo_inode_put(lo, &dinode); 1466 1467 /* 1468 * If there's an error, we can only signal it if we haven't stored 1469 * any entries yet - otherwise we'd end up with wrong lookup 1470 * counts for the entries that are already in the buffer. So we 1471 * return what we've collected until that point. 
1472 */ 1473 if (err && rem == size) { 1474 fuse_reply_err(req, err); 1475 } else { 1476 fuse_reply_buf(req, buf, size - rem); 1477 } 1478 free(buf); 1479} 1480 1481static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, 1482 off_t offset, struct fuse_file_info *fi) 1483{ 1484 lo_do_readdir(req, ino, size, offset, fi, 0); 1485} 1486 1487static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, 1488 off_t offset, struct fuse_file_info *fi) 1489{ 1490 lo_do_readdir(req, ino, size, offset, fi, 1); 1491} 1492 1493static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, 1494 struct fuse_file_info *fi) 1495{ 1496 struct lo_data *lo = lo_data(req); 1497 struct lo_map_elem *elem; 1498 struct lo_dirp *d; 1499 1500 (void)ino; 1501 1502 pthread_mutex_lock(&lo->mutex); 1503 elem = lo_map_get(&lo->dirp_map, fi->fh); 1504 if (!elem) { 1505 pthread_mutex_unlock(&lo->mutex); 1506 fuse_reply_err(req, EBADF); 1507 return; 1508 } 1509 1510 d = elem->dirp; 1511 lo_map_remove(&lo->dirp_map, fi->fh); 1512 pthread_mutex_unlock(&lo->mutex); 1513 1514 lo_dirp_put(&d); /* paired with lo_opendir() */ 1515 1516 fuse_reply_err(req, 0); 1517} 1518 1519static void update_open_flags(int writeback, struct fuse_file_info *fi) 1520{ 1521 /* 1522 * With writeback cache, kernel may send read requests even 1523 * when userspace opened write-only 1524 */ 1525 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { 1526 fi->flags &= ~O_ACCMODE; 1527 fi->flags |= O_RDWR; 1528 } 1529 1530 /* 1531 * With writeback cache, O_APPEND is handled by the kernel. 1532 * This breaks atomicity (since the file may change in the 1533 * underlying filesystem, so that the kernel's idea of the 1534 * end of the file isn't accurate anymore). In this example, 1535 * we just accept that. 
A more rigorous filesystem may want 1536 * to return an error here 1537 */ 1538 if (writeback && (fi->flags & O_APPEND)) { 1539 fi->flags &= ~O_APPEND; 1540 } 1541 1542 /* 1543 * O_DIRECT in guest should not necessarily mean bypassing page 1544 * cache on host as well. If somebody needs that behavior, it 1545 * probably should be a configuration knob in daemon. 1546 */ 1547 fi->flags &= ~O_DIRECT; 1548} 1549 1550static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, 1551 mode_t mode, struct fuse_file_info *fi) 1552{ 1553 int fd; 1554 struct lo_data *lo = lo_data(req); 1555 struct lo_inode *parent_inode; 1556 struct fuse_entry_param e; 1557 int err; 1558 struct lo_cred old = {}; 1559 1560 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent, 1561 name); 1562 1563 if (!is_safe_path_component(name)) { 1564 fuse_reply_err(req, EINVAL); 1565 return; 1566 } 1567 1568 parent_inode = lo_inode(req, parent); 1569 if (!parent_inode) { 1570 fuse_reply_err(req, EBADF); 1571 return; 1572 } 1573 1574 err = lo_change_cred(req, &old); 1575 if (err) { 1576 goto out; 1577 } 1578 1579 update_open_flags(lo->writeback, fi); 1580 1581 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, 1582 mode); 1583 err = fd == -1 ? 
errno : 0; 1584 lo_restore_cred(&old); 1585 1586 if (!err) { 1587 ssize_t fh; 1588 1589 pthread_mutex_lock(&lo->mutex); 1590 fh = lo_add_fd_mapping(req, fd); 1591 pthread_mutex_unlock(&lo->mutex); 1592 if (fh == -1) { 1593 close(fd); 1594 err = ENOMEM; 1595 goto out; 1596 } 1597 1598 fi->fh = fh; 1599 err = lo_do_lookup(req, parent, name, &e); 1600 } 1601 if (lo->cache == CACHE_NONE) { 1602 fi->direct_io = 1; 1603 } else if (lo->cache == CACHE_ALWAYS) { 1604 fi->keep_cache = 1; 1605 } 1606 1607out: 1608 lo_inode_put(lo, &parent_inode); 1609 1610 if (err) { 1611 fuse_reply_err(req, err); 1612 } else { 1613 fuse_reply_create(req, &e, fi); 1614 } 1615} 1616 1617/* Should be called with inode->plock_mutex held */ 1618static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, 1619 struct lo_inode *inode, 1620 uint64_t lock_owner, 1621 pid_t pid, int *err) 1622{ 1623 struct lo_inode_plock *plock; 1624 char procname[64]; 1625 int fd; 1626 1627 plock = 1628 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner)); 1629 1630 if (plock) { 1631 return plock; 1632 } 1633 1634 plock = malloc(sizeof(struct lo_inode_plock)); 1635 if (!plock) { 1636 *err = ENOMEM; 1637 return NULL; 1638 } 1639 1640 /* Open another instance of file which can be used for ofd locks. */ 1641 sprintf(procname, "%i", inode->fd); 1642 1643 /* TODO: What if file is not writable? 
*/ 1644 fd = openat(lo->proc_self_fd, procname, O_RDWR); 1645 if (fd == -1) { 1646 *err = errno; 1647 free(plock); 1648 return NULL; 1649 } 1650 1651 plock->lock_owner = lock_owner; 1652 plock->fd = fd; 1653 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner), 1654 plock); 1655 return plock; 1656} 1657 1658static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, 1659 struct flock *lock) 1660{ 1661 struct lo_data *lo = lo_data(req); 1662 struct lo_inode *inode; 1663 struct lo_inode_plock *plock; 1664 int ret, saverr = 0; 1665 1666 fuse_log(FUSE_LOG_DEBUG, 1667 "lo_getlk(ino=%" PRIu64 ", flags=%d)" 1668 " owner=0x%lx, l_type=%d l_start=0x%lx" 1669 " l_len=0x%lx\n", 1670 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start, 1671 lock->l_len); 1672 1673 inode = lo_inode(req, ino); 1674 if (!inode) { 1675 fuse_reply_err(req, EBADF); 1676 return; 1677 } 1678 1679 pthread_mutex_lock(&inode->plock_mutex); 1680 plock = 1681 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); 1682 if (!plock) { 1683 saverr = ret; 1684 goto out; 1685 } 1686 1687 ret = fcntl(plock->fd, F_OFD_GETLK, lock); 1688 if (ret == -1) { 1689 saverr = errno; 1690 } 1691 1692out: 1693 pthread_mutex_unlock(&inode->plock_mutex); 1694 lo_inode_put(lo, &inode); 1695 1696 if (saverr) { 1697 fuse_reply_err(req, saverr); 1698 } else { 1699 fuse_reply_lock(req, lock); 1700 } 1701} 1702 1703static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, 1704 struct flock *lock, int sleep) 1705{ 1706 struct lo_data *lo = lo_data(req); 1707 struct lo_inode *inode; 1708 struct lo_inode_plock *plock; 1709 int ret, saverr = 0; 1710 1711 fuse_log(FUSE_LOG_DEBUG, 1712 "lo_setlk(ino=%" PRIu64 ", flags=%d)" 1713 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d" 1714 " l_start=0x%lx l_len=0x%lx\n", 1715 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep, 1716 lock->l_whence, lock->l_start, lock->l_len); 1717 1718 if 
(sleep) { 1719 fuse_reply_err(req, EOPNOTSUPP); 1720 return; 1721 } 1722 1723 inode = lo_inode(req, ino); 1724 if (!inode) { 1725 fuse_reply_err(req, EBADF); 1726 return; 1727 } 1728 1729 pthread_mutex_lock(&inode->plock_mutex); 1730 plock = 1731 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); 1732 1733 if (!plock) { 1734 saverr = ret; 1735 goto out; 1736 } 1737 1738 /* TODO: Is it alright to modify flock? */ 1739 lock->l_pid = 0; 1740 ret = fcntl(plock->fd, F_OFD_SETLK, lock); 1741 if (ret == -1) { 1742 saverr = errno; 1743 } 1744 1745out: 1746 pthread_mutex_unlock(&inode->plock_mutex); 1747 lo_inode_put(lo, &inode); 1748 1749 fuse_reply_err(req, saverr); 1750} 1751 1752static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, 1753 struct fuse_file_info *fi) 1754{ 1755 int res; 1756 struct lo_dirp *d; 1757 int fd; 1758 1759 (void)ino; 1760 1761 d = lo_dirp(req, fi); 1762 if (!d) { 1763 fuse_reply_err(req, EBADF); 1764 return; 1765 } 1766 1767 fd = dirfd(d->dp); 1768 if (datasync) { 1769 res = fdatasync(fd); 1770 } else { 1771 res = fsync(fd); 1772 } 1773 1774 lo_dirp_put(&d); 1775 1776 fuse_reply_err(req, res == -1 ? 
errno : 0); 1777} 1778 1779static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) 1780{ 1781 int fd; 1782 ssize_t fh; 1783 char buf[64]; 1784 struct lo_data *lo = lo_data(req); 1785 1786 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, 1787 fi->flags); 1788 1789 update_open_flags(lo->writeback, fi); 1790 1791 sprintf(buf, "%i", lo_fd(req, ino)); 1792 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); 1793 if (fd == -1) { 1794 return (void)fuse_reply_err(req, errno); 1795 } 1796 1797 pthread_mutex_lock(&lo->mutex); 1798 fh = lo_add_fd_mapping(req, fd); 1799 pthread_mutex_unlock(&lo->mutex); 1800 if (fh == -1) { 1801 close(fd); 1802 fuse_reply_err(req, ENOMEM); 1803 return; 1804 } 1805 1806 fi->fh = fh; 1807 if (lo->cache == CACHE_NONE) { 1808 fi->direct_io = 1; 1809 } else if (lo->cache == CACHE_ALWAYS) { 1810 fi->keep_cache = 1; 1811 } 1812 fuse_reply_open(req, fi); 1813} 1814 1815static void lo_release(fuse_req_t req, fuse_ino_t ino, 1816 struct fuse_file_info *fi) 1817{ 1818 struct lo_data *lo = lo_data(req); 1819 struct lo_map_elem *elem; 1820 int fd = -1; 1821 1822 (void)ino; 1823 1824 pthread_mutex_lock(&lo->mutex); 1825 elem = lo_map_get(&lo->fd_map, fi->fh); 1826 if (elem) { 1827 fd = elem->fd; 1828 elem = NULL; 1829 lo_map_remove(&lo->fd_map, fi->fh); 1830 } 1831 pthread_mutex_unlock(&lo->mutex); 1832 1833 close(fd); 1834 fuse_reply_err(req, 0); 1835} 1836 1837static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) 1838{ 1839 int res; 1840 (void)ino; 1841 struct lo_inode *inode; 1842 1843 inode = lo_inode(req, ino); 1844 if (!inode) { 1845 fuse_reply_err(req, EBADF); 1846 return; 1847 } 1848 1849 /* An fd is going away. 
Cleanup associated posix locks */ 1850 pthread_mutex_lock(&inode->plock_mutex); 1851 g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner)); 1852 pthread_mutex_unlock(&inode->plock_mutex); 1853 1854 res = close(dup(lo_fi_fd(req, fi))); 1855 lo_inode_put(lo_data(req), &inode); 1856 fuse_reply_err(req, res == -1 ? errno : 0); 1857} 1858 1859static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, 1860 struct fuse_file_info *fi) 1861{ 1862 int res; 1863 int fd; 1864 char *buf; 1865 1866 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, 1867 (void *)fi); 1868 1869 if (!fi) { 1870 struct lo_data *lo = lo_data(req); 1871 1872 res = asprintf(&buf, "%i", lo_fd(req, ino)); 1873 if (res == -1) { 1874 return (void)fuse_reply_err(req, errno); 1875 } 1876 1877 fd = openat(lo->proc_self_fd, buf, O_RDWR); 1878 free(buf); 1879 if (fd == -1) { 1880 return (void)fuse_reply_err(req, errno); 1881 } 1882 } else { 1883 fd = lo_fi_fd(req, fi); 1884 } 1885 1886 if (datasync) { 1887 res = fdatasync(fd); 1888 } else { 1889 res = fsync(fd); 1890 } 1891 if (!fi) { 1892 close(fd); 1893 } 1894 fuse_reply_err(req, res == -1 ? 
errno : 0); 1895} 1896 1897static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, 1898 struct fuse_file_info *fi) 1899{ 1900 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); 1901 1902 fuse_log(FUSE_LOG_DEBUG, 1903 "lo_read(ino=%" PRIu64 ", size=%zd, " 1904 "off=%lu)\n", 1905 ino, size, (unsigned long)offset); 1906 1907 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; 1908 buf.buf[0].fd = lo_fi_fd(req, fi); 1909 buf.buf[0].pos = offset; 1910 1911 fuse_reply_data(req, &buf); 1912} 1913 1914static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, 1915 struct fuse_bufvec *in_buf, off_t off, 1916 struct fuse_file_info *fi) 1917{ 1918 (void)ino; 1919 ssize_t res; 1920 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); 1921 bool cap_fsetid_dropped = false; 1922 1923 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; 1924 out_buf.buf[0].fd = lo_fi_fd(req, fi); 1925 out_buf.buf[0].pos = off; 1926 1927 fuse_log(FUSE_LOG_DEBUG, 1928 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, 1929 out_buf.buf[0].size, (unsigned long)off); 1930 1931 /* 1932 * If kill_priv is set, drop CAP_FSETID which should lead to kernel 1933 * clearing setuid/setgid on file. 
1934 */ 1935 if (fi->kill_priv) { 1936 res = drop_effective_cap("FSETID", &cap_fsetid_dropped); 1937 if (res != 0) { 1938 fuse_reply_err(req, res); 1939 return; 1940 } 1941 } 1942 1943 res = fuse_buf_copy(&out_buf, in_buf); 1944 if (res < 0) { 1945 fuse_reply_err(req, -res); 1946 } else { 1947 fuse_reply_write(req, (size_t)res); 1948 } 1949 1950 if (cap_fsetid_dropped) { 1951 res = gain_effective_cap("FSETID"); 1952 if (res) { 1953 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); 1954 } 1955 } 1956} 1957 1958static void lo_statfs(fuse_req_t req, fuse_ino_t ino) 1959{ 1960 int res; 1961 struct statvfs stbuf; 1962 1963 res = fstatvfs(lo_fd(req, ino), &stbuf); 1964 if (res == -1) { 1965 fuse_reply_err(req, errno); 1966 } else { 1967 fuse_reply_statfs(req, &stbuf); 1968 } 1969} 1970 1971static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, 1972 off_t length, struct fuse_file_info *fi) 1973{ 1974 int err = EOPNOTSUPP; 1975 (void)ino; 1976 1977#ifdef CONFIG_FALLOCATE 1978 err = fallocate(lo_fi_fd(req, fi), mode, offset, length); 1979 if (err < 0) { 1980 err = errno; 1981 } 1982 1983#elif defined(CONFIG_POSIX_FALLOCATE) 1984 if (mode) { 1985 fuse_reply_err(req, EOPNOTSUPP); 1986 return; 1987 } 1988 1989 err = posix_fallocate(lo_fi_fd(req, fi), offset, length); 1990#endif 1991 1992 fuse_reply_err(req, err); 1993} 1994 1995static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, 1996 int op) 1997{ 1998 int res; 1999 (void)ino; 2000 2001 res = flock(lo_fi_fd(req, fi), op); 2002 2003 fuse_reply_err(req, res == -1 ? 
errno : 0); 2004} 2005 2006static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, 2007 size_t size) 2008{ 2009 struct lo_data *lo = lo_data(req); 2010 char *value = NULL; 2011 char procname[64]; 2012 struct lo_inode *inode; 2013 ssize_t ret; 2014 int saverr; 2015 int fd = -1; 2016 2017 inode = lo_inode(req, ino); 2018 if (!inode) { 2019 fuse_reply_err(req, EBADF); 2020 return; 2021 } 2022 2023 saverr = ENOSYS; 2024 if (!lo_data(req)->xattr) { 2025 goto out; 2026 } 2027 2028 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", 2029 ino, name, size); 2030 2031 if (size) { 2032 value = malloc(size); 2033 if (!value) { 2034 goto out_err; 2035 } 2036 } 2037 2038 sprintf(procname, "%i", inode->fd); 2039 /* 2040 * It is not safe to open() non-regular/non-dir files in file server 2041 * unless O_PATH is used, so use that method for regular files/dir 2042 * only (as it seems giving less performance overhead). 2043 * Otherwise, call fchdir() to avoid open(). 
2044 */ 2045 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2046 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2047 if (fd < 0) { 2048 goto out_err; 2049 } 2050 ret = fgetxattr(fd, name, value, size); 2051 } else { 2052 /* fchdir should not fail here */ 2053 assert(fchdir(lo->proc_self_fd) == 0); 2054 ret = getxattr(procname, name, value, size); 2055 assert(fchdir(lo->root.fd) == 0); 2056 } 2057 2058 if (ret == -1) { 2059 goto out_err; 2060 } 2061 if (size) { 2062 saverr = 0; 2063 if (ret == 0) { 2064 goto out; 2065 } 2066 fuse_reply_buf(req, value, ret); 2067 } else { 2068 fuse_reply_xattr(req, ret); 2069 } 2070out_free: 2071 free(value); 2072 2073 if (fd >= 0) { 2074 close(fd); 2075 } 2076 2077 lo_inode_put(lo, &inode); 2078 return; 2079 2080out_err: 2081 saverr = errno; 2082out: 2083 fuse_reply_err(req, saverr); 2084 goto out_free; 2085} 2086 2087static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) 2088{ 2089 struct lo_data *lo = lo_data(req); 2090 char *value = NULL; 2091 char procname[64]; 2092 struct lo_inode *inode; 2093 ssize_t ret; 2094 int saverr; 2095 int fd = -1; 2096 2097 inode = lo_inode(req, ino); 2098 if (!inode) { 2099 fuse_reply_err(req, EBADF); 2100 return; 2101 } 2102 2103 saverr = ENOSYS; 2104 if (!lo_data(req)->xattr) { 2105 goto out; 2106 } 2107 2108 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, 2109 size); 2110 2111 if (size) { 2112 value = malloc(size); 2113 if (!value) { 2114 goto out_err; 2115 } 2116 } 2117 2118 sprintf(procname, "%i", inode->fd); 2119 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2120 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2121 if (fd < 0) { 2122 goto out_err; 2123 } 2124 ret = flistxattr(fd, value, size); 2125 } else { 2126 /* fchdir should not fail here */ 2127 assert(fchdir(lo->proc_self_fd) == 0); 2128 ret = listxattr(procname, value, size); 2129 assert(fchdir(lo->root.fd) == 0); 2130 } 2131 2132 if (ret == -1) { 2133 goto 
out_err; 2134 } 2135 if (size) { 2136 saverr = 0; 2137 if (ret == 0) { 2138 goto out; 2139 } 2140 fuse_reply_buf(req, value, ret); 2141 } else { 2142 fuse_reply_xattr(req, ret); 2143 } 2144out_free: 2145 free(value); 2146 2147 if (fd >= 0) { 2148 close(fd); 2149 } 2150 2151 lo_inode_put(lo, &inode); 2152 return; 2153 2154out_err: 2155 saverr = errno; 2156out: 2157 fuse_reply_err(req, saverr); 2158 goto out_free; 2159} 2160 2161static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, 2162 const char *value, size_t size, int flags) 2163{ 2164 char procname[64]; 2165 struct lo_data *lo = lo_data(req); 2166 struct lo_inode *inode; 2167 ssize_t ret; 2168 int saverr; 2169 int fd = -1; 2170 2171 inode = lo_inode(req, ino); 2172 if (!inode) { 2173 fuse_reply_err(req, EBADF); 2174 return; 2175 } 2176 2177 saverr = ENOSYS; 2178 if (!lo_data(req)->xattr) { 2179 goto out; 2180 } 2181 2182 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 2183 ", name=%s value=%s size=%zd)\n", ino, name, value, size); 2184 2185 sprintf(procname, "%i", inode->fd); 2186 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2187 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2188 if (fd < 0) { 2189 saverr = errno; 2190 goto out; 2191 } 2192 ret = fsetxattr(fd, name, value, size, flags); 2193 } else { 2194 /* fchdir should not fail here */ 2195 assert(fchdir(lo->proc_self_fd) == 0); 2196 ret = setxattr(procname, name, value, size, flags); 2197 assert(fchdir(lo->root.fd) == 0); 2198 } 2199 2200 saverr = ret == -1 ? 
errno : 0; 2201 2202out: 2203 if (fd >= 0) { 2204 close(fd); 2205 } 2206 2207 lo_inode_put(lo, &inode); 2208 fuse_reply_err(req, saverr); 2209} 2210 2211static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) 2212{ 2213 char procname[64]; 2214 struct lo_data *lo = lo_data(req); 2215 struct lo_inode *inode; 2216 ssize_t ret; 2217 int saverr; 2218 int fd = -1; 2219 2220 inode = lo_inode(req, ino); 2221 if (!inode) { 2222 fuse_reply_err(req, EBADF); 2223 return; 2224 } 2225 2226 saverr = ENOSYS; 2227 if (!lo_data(req)->xattr) { 2228 goto out; 2229 } 2230 2231 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, 2232 name); 2233 2234 sprintf(procname, "%i", inode->fd); 2235 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2236 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2237 if (fd < 0) { 2238 saverr = errno; 2239 goto out; 2240 } 2241 ret = fremovexattr(fd, name); 2242 } else { 2243 /* fchdir should not fail here */ 2244 assert(fchdir(lo->proc_self_fd) == 0); 2245 ret = removexattr(procname, name); 2246 assert(fchdir(lo->root.fd) == 0); 2247 } 2248 2249 saverr = ret == -1 ? 
errno : 0; 2250 2251out: 2252 if (fd >= 0) { 2253 close(fd); 2254 } 2255 2256 lo_inode_put(lo, &inode); 2257 fuse_reply_err(req, saverr); 2258} 2259 2260#ifdef HAVE_COPY_FILE_RANGE 2261static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, 2262 struct fuse_file_info *fi_in, fuse_ino_t ino_out, 2263 off_t off_out, struct fuse_file_info *fi_out, 2264 size_t len, int flags) 2265{ 2266 int in_fd, out_fd; 2267 ssize_t res; 2268 2269 in_fd = lo_fi_fd(req, fi_in); 2270 out_fd = lo_fi_fd(req, fi_out); 2271 2272 fuse_log(FUSE_LOG_DEBUG, 2273 "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, " 2274 "off=%lu, ino=%" PRIu64 "/fd=%d, " 2275 "off=%lu, size=%zd, flags=0x%x)\n", 2276 ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags); 2277 2278 res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); 2279 if (res < 0) { 2280 fuse_reply_err(req, errno); 2281 } else { 2282 fuse_reply_write(req, res); 2283 } 2284} 2285#endif 2286 2287static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, 2288 struct fuse_file_info *fi) 2289{ 2290 off_t res; 2291 2292 (void)ino; 2293 res = lseek(lo_fi_fd(req, fi), off, whence); 2294 if (res != -1) { 2295 fuse_reply_lseek(req, res); 2296 } else { 2297 fuse_reply_err(req, errno); 2298 } 2299} 2300 2301static void lo_destroy(void *userdata) 2302{ 2303 struct lo_data *lo = (struct lo_data *)userdata; 2304 2305 pthread_mutex_lock(&lo->mutex); 2306 while (true) { 2307 GHashTableIter iter; 2308 gpointer key, value; 2309 2310 g_hash_table_iter_init(&iter, lo->inodes); 2311 if (!g_hash_table_iter_next(&iter, &key, &value)) { 2312 break; 2313 } 2314 2315 struct lo_inode *inode = value; 2316 unref_inode(lo, inode, inode->nlookup); 2317 } 2318 pthread_mutex_unlock(&lo->mutex); 2319} 2320 2321static struct fuse_lowlevel_ops lo_oper = { 2322 .init = lo_init, 2323 .lookup = lo_lookup, 2324 .mkdir = lo_mkdir, 2325 .mknod = lo_mknod, 2326 .symlink = lo_symlink, 2327 .link = lo_link, 2328 .unlink = lo_unlink, 
2329 .rmdir = lo_rmdir, 2330 .rename = lo_rename, 2331 .forget = lo_forget, 2332 .forget_multi = lo_forget_multi, 2333 .getattr = lo_getattr, 2334 .setattr = lo_setattr, 2335 .readlink = lo_readlink, 2336 .opendir = lo_opendir, 2337 .readdir = lo_readdir, 2338 .readdirplus = lo_readdirplus, 2339 .releasedir = lo_releasedir, 2340 .fsyncdir = lo_fsyncdir, 2341 .create = lo_create, 2342 .getlk = lo_getlk, 2343 .setlk = lo_setlk, 2344 .open = lo_open, 2345 .release = lo_release, 2346 .flush = lo_flush, 2347 .fsync = lo_fsync, 2348 .read = lo_read, 2349 .write_buf = lo_write_buf, 2350 .statfs = lo_statfs, 2351 .fallocate = lo_fallocate, 2352 .flock = lo_flock, 2353 .getxattr = lo_getxattr, 2354 .listxattr = lo_listxattr, 2355 .setxattr = lo_setxattr, 2356 .removexattr = lo_removexattr, 2357#ifdef HAVE_COPY_FILE_RANGE 2358 .copy_file_range = lo_copy_file_range, 2359#endif 2360 .lseek = lo_lseek, 2361 .destroy = lo_destroy, 2362}; 2363 2364/* Print vhost-user.json backend program capabilities */ 2365static void print_capabilities(void) 2366{ 2367 printf("{\n"); 2368 printf(" \"type\": \"fs\"\n"); 2369 printf("}\n"); 2370} 2371 2372/* 2373 * Drop all Linux capabilities because the wait parent process only needs to 2374 * sit in waitpid(2) and terminate. 2375 */ 2376static void setup_wait_parent_capabilities(void) 2377{ 2378 capng_setpid(syscall(SYS_gettid)); 2379 capng_clear(CAPNG_SELECT_BOTH); 2380 capng_apply(CAPNG_SELECT_BOTH); 2381} 2382 2383/* 2384 * Move to a new mount, net, and pid namespaces to isolate this process. 2385 */ 2386static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) 2387{ 2388 pid_t child; 2389 char template[] = "virtiofsd-XXXXXX"; 2390 char *tmpdir; 2391 2392 /* 2393 * Create a new pid namespace for *child* processes. We'll have to 2394 * fork in order to enter the new pid namespace. A new mount namespace 2395 * is also needed so that we can remount /proc for the new pid 2396 * namespace. 
2397 * 2398 * Our UNIX domain sockets have been created. Now we can move to 2399 * an empty network namespace to prevent TCP/IP and other network 2400 * activity in case this process is compromised. 2401 */ 2402 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) { 2403 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n"); 2404 exit(1); 2405 } 2406 2407 child = fork(); 2408 if (child < 0) { 2409 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n"); 2410 exit(1); 2411 } 2412 if (child > 0) { 2413 pid_t waited; 2414 int wstatus; 2415 2416 setup_wait_parent_capabilities(); 2417 2418 /* The parent waits for the child */ 2419 do { 2420 waited = waitpid(child, &wstatus, 0); 2421 } while (waited < 0 && errno == EINTR && !se->exited); 2422 2423 /* We were terminated by a signal, see fuse_signals.c */ 2424 if (se->exited) { 2425 exit(0); 2426 } 2427 2428 if (WIFEXITED(wstatus)) { 2429 exit(WEXITSTATUS(wstatus)); 2430 } 2431 2432 exit(1); 2433 } 2434 2435 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */ 2436 prctl(PR_SET_PDEATHSIG, SIGTERM); 2437 2438 /* 2439 * If the mounts have shared propagation then we want to opt out so our 2440 * mount changes don't affect the parent mount namespace. 
2441 */ 2442 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { 2443 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n"); 2444 exit(1); 2445 } 2446 2447 /* The child must remount /proc to use the new pid namespace */ 2448 if (mount("proc", "/proc", "proc", 2449 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { 2450 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n"); 2451 exit(1); 2452 } 2453 2454 tmpdir = mkdtemp(template); 2455 if (!tmpdir) { 2456 fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template); 2457 exit(1); 2458 } 2459 2460 if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) { 2461 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n", 2462 tmpdir); 2463 exit(1); 2464 } 2465 2466 /* Now we can get our /proc/self/fd directory file descriptor */ 2467 lo->proc_self_fd = open(tmpdir, O_PATH); 2468 if (lo->proc_self_fd == -1) { 2469 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir); 2470 exit(1); 2471 } 2472 2473 if (umount2(tmpdir, MNT_DETACH) < 0) { 2474 fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir); 2475 exit(1); 2476 } 2477 2478 if (rmdir(tmpdir) < 0) { 2479 fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir); 2480 } 2481} 2482 2483/* 2484 * Capture the capability state, we'll need to restore this for individual 2485 * threads later; see load_capng. 
2486 */ 2487static void setup_capng(void) 2488{ 2489 /* Note this accesses /proc so has to happen before the sandbox */ 2490 if (capng_get_caps_process()) { 2491 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n"); 2492 exit(1); 2493 } 2494 pthread_mutex_init(&cap.mutex, NULL); 2495 pthread_mutex_lock(&cap.mutex); 2496 cap.saved = capng_save_state(); 2497 if (!cap.saved) { 2498 fuse_log(FUSE_LOG_ERR, "capng_save_state\n"); 2499 exit(1); 2500 } 2501 pthread_mutex_unlock(&cap.mutex); 2502} 2503 2504static void cleanup_capng(void) 2505{ 2506 free(cap.saved); 2507 cap.saved = NULL; 2508 pthread_mutex_destroy(&cap.mutex); 2509} 2510 2511 2512/* 2513 * Make the source directory our root so symlinks cannot escape and no other 2514 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. 2515 */ 2516static void setup_mounts(const char *source) 2517{ 2518 int oldroot; 2519 int newroot; 2520 2521 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { 2522 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); 2523 exit(1); 2524 } 2525 2526 /* This magic is based on lxc's lxc_pivot_root() */ 2527 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 2528 if (oldroot < 0) { 2529 fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); 2530 exit(1); 2531 } 2532 2533 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 2534 if (newroot < 0) { 2535 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source); 2536 exit(1); 2537 } 2538 2539 if (fchdir(newroot) < 0) { 2540 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); 2541 exit(1); 2542 } 2543 2544 if (syscall(__NR_pivot_root, ".", ".") < 0) { 2545 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n"); 2546 exit(1); 2547 } 2548 2549 if (fchdir(oldroot) < 0) { 2550 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n"); 2551 exit(1); 2552 } 2553 2554 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) { 2555 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n"); 2556 exit(1); 2557 } 2558 2559 if (umount2(".", 
MNT_DETACH) < 0) { 2560 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n"); 2561 exit(1); 2562 } 2563 2564 if (fchdir(newroot) < 0) { 2565 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); 2566 exit(1); 2567 } 2568 2569 close(newroot); 2570 close(oldroot); 2571} 2572 2573/* 2574 * Only keep whitelisted capabilities that are needed for file system operation 2575 * The (possibly NULL) modcaps_in string passed in is free'd before exit. 2576 */ 2577static void setup_capabilities(char *modcaps_in) 2578{ 2579 char *modcaps = modcaps_in; 2580 pthread_mutex_lock(&cap.mutex); 2581 capng_restore_state(&cap.saved); 2582 2583 /* 2584 * Whitelist file system-related capabilities that are needed for a file 2585 * server to act like root. Drop everything else like networking and 2586 * sysadmin capabilities. 2587 * 2588 * Exclusions: 2589 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl 2590 * and we don't support that. 2591 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be 2592 * used by the Smack LSM. Omit it until there is demand for it. 2593 */ 2594 capng_setpid(syscall(SYS_gettid)); 2595 capng_clear(CAPNG_SELECT_BOTH); 2596 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, 2597 CAP_CHOWN, 2598 CAP_DAC_OVERRIDE, 2599 CAP_DAC_READ_SEARCH, 2600 CAP_FOWNER, 2601 CAP_FSETID, 2602 CAP_SETGID, 2603 CAP_SETUID, 2604 CAP_MKNOD, 2605 CAP_SETFCAP, 2606 -1)) { 2607 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__); 2608 exit(1); 2609 } 2610 2611 /* 2612 * The modcaps option is a colon separated list of caps, 2613 * each preceded by either + or -. 
2614 */ 2615 while (modcaps) { 2616 capng_act_t action; 2617 int cap; 2618 2619 char *next = strchr(modcaps, ':'); 2620 if (next) { 2621 *next = '\0'; 2622 next++; 2623 } 2624 2625 switch (modcaps[0]) { 2626 case '+': 2627 action = CAPNG_ADD; 2628 break; 2629 2630 case '-': 2631 action = CAPNG_DROP; 2632 break; 2633 2634 default: 2635 fuse_log(FUSE_LOG_ERR, 2636 "%s: Expecting '+'/'-' in modcaps but found '%c'\n", 2637 __func__, modcaps[0]); 2638 exit(1); 2639 } 2640 cap = capng_name_to_capability(modcaps + 1); 2641 if (cap < 0) { 2642 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__, 2643 modcaps); 2644 exit(1); 2645 } 2646 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) { 2647 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n", 2648 __func__, modcaps); 2649 exit(1); 2650 } 2651 2652 modcaps = next; 2653 } 2654 g_free(modcaps_in); 2655 2656 if (capng_apply(CAPNG_SELECT_BOTH)) { 2657 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__); 2658 exit(1); 2659 } 2660 2661 cap.saved = capng_save_state(); 2662 if (!cap.saved) { 2663 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__); 2664 exit(1); 2665 } 2666 pthread_mutex_unlock(&cap.mutex); 2667} 2668 2669/* 2670 * Lock down this process to prevent access to other processes or files outside 2671 * source directory. This reduces the impact of arbitrary code execution bugs. 
2672 */ 2673static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, 2674 bool enable_syslog) 2675{ 2676 setup_namespaces(lo, se); 2677 setup_mounts(lo->source); 2678 setup_seccomp(enable_syslog); 2679 setup_capabilities(g_strdup(lo->modcaps)); 2680} 2681 2682/* Set the maximum number of open file descriptors */ 2683static void setup_nofile_rlimit(unsigned long rlimit_nofile) 2684{ 2685 struct rlimit rlim = { 2686 .rlim_cur = rlimit_nofile, 2687 .rlim_max = rlimit_nofile, 2688 }; 2689 2690 if (rlimit_nofile == 0) { 2691 return; /* nothing to do */ 2692 } 2693 2694 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { 2695 /* Ignore SELinux denials */ 2696 if (errno == EPERM) { 2697 return; 2698 } 2699 2700 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n"); 2701 exit(1); 2702 } 2703} 2704 2705static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) 2706{ 2707 g_autofree char *localfmt = NULL; 2708 2709 if (current_log_level < level) { 2710 return; 2711 } 2712 2713 if (current_log_level == FUSE_LOG_DEBUG) { 2714 if (!use_syslog) { 2715 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s", 2716 get_clock(), syscall(__NR_gettid), fmt); 2717 } else { 2718 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), 2719 fmt); 2720 } 2721 fmt = localfmt; 2722 } 2723 2724 if (use_syslog) { 2725 int priority = LOG_ERR; 2726 switch (level) { 2727 case FUSE_LOG_EMERG: 2728 priority = LOG_EMERG; 2729 break; 2730 case FUSE_LOG_ALERT: 2731 priority = LOG_ALERT; 2732 break; 2733 case FUSE_LOG_CRIT: 2734 priority = LOG_CRIT; 2735 break; 2736 case FUSE_LOG_ERR: 2737 priority = LOG_ERR; 2738 break; 2739 case FUSE_LOG_WARNING: 2740 priority = LOG_WARNING; 2741 break; 2742 case FUSE_LOG_NOTICE: 2743 priority = LOG_NOTICE; 2744 break; 2745 case FUSE_LOG_INFO: 2746 priority = LOG_INFO; 2747 break; 2748 case FUSE_LOG_DEBUG: 2749 priority = LOG_DEBUG; 2750 break; 2751 } 2752 vsyslog(priority, fmt, ap); 2753 } else { 2754 vfprintf(stderr, fmt, ap); 
2755 } 2756} 2757 2758static void setup_root(struct lo_data *lo, struct lo_inode *root) 2759{ 2760 int fd, res; 2761 struct stat stat; 2762 2763 fd = open("/", O_PATH); 2764 if (fd == -1) { 2765 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source); 2766 exit(1); 2767 } 2768 2769 res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 2770 if (res == -1) { 2771 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); 2772 exit(1); 2773 } 2774 2775 root->filetype = S_IFDIR; 2776 root->fd = fd; 2777 root->key.ino = stat.st_ino; 2778 root->key.dev = stat.st_dev; 2779 root->nlookup = 2; 2780 g_atomic_int_set(&root->refcount, 2); 2781} 2782 2783static guint lo_key_hash(gconstpointer key) 2784{ 2785 const struct lo_key *lkey = key; 2786 2787 return (guint)lkey->ino + (guint)lkey->dev; 2788} 2789 2790static gboolean lo_key_equal(gconstpointer a, gconstpointer b) 2791{ 2792 const struct lo_key *la = a; 2793 const struct lo_key *lb = b; 2794 2795 return la->ino == lb->ino && la->dev == lb->dev; 2796} 2797 2798static void fuse_lo_data_cleanup(struct lo_data *lo) 2799{ 2800 if (lo->inodes) { 2801 g_hash_table_destroy(lo->inodes); 2802 } 2803 lo_map_destroy(&lo->fd_map); 2804 lo_map_destroy(&lo->dirp_map); 2805 lo_map_destroy(&lo->ino_map); 2806 2807 if (lo->proc_self_fd >= 0) { 2808 close(lo->proc_self_fd); 2809 } 2810 2811 if (lo->root.fd >= 0) { 2812 close(lo->root.fd); 2813 } 2814 2815 free(lo->source); 2816} 2817 2818int main(int argc, char *argv[]) 2819{ 2820 struct fuse_args args = FUSE_ARGS_INIT(argc, argv); 2821 struct fuse_session *se; 2822 struct fuse_cmdline_opts opts; 2823 struct lo_data lo = { 2824 .debug = 0, 2825 .writeback = 0, 2826 .posix_lock = 1, 2827 .proc_self_fd = -1, 2828 }; 2829 struct lo_map_elem *root_elem; 2830 int ret = -1; 2831 2832 /* Don't mask creation mode, kernel already did that */ 2833 umask(0); 2834 2835 pthread_mutex_init(&lo.mutex, NULL); 2836 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); 2837 lo.root.fd = 
-1; 2838 lo.root.fuse_ino = FUSE_ROOT_ID; 2839 lo.cache = CACHE_AUTO; 2840 2841 /* 2842 * Set up the ino map like this: 2843 * [0] Reserved (will not be used) 2844 * [1] Root inode 2845 */ 2846 lo_map_init(&lo.ino_map); 2847 lo_map_reserve(&lo.ino_map, 0)->in_use = false; 2848 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); 2849 root_elem->inode = &lo.root; 2850 2851 lo_map_init(&lo.dirp_map); 2852 lo_map_init(&lo.fd_map); 2853 2854 if (fuse_parse_cmdline(&args, &opts) != 0) { 2855 goto err_out1; 2856 } 2857 fuse_set_log_func(log_func); 2858 use_syslog = opts.syslog; 2859 if (use_syslog) { 2860 openlog("virtiofsd", LOG_PID, LOG_DAEMON); 2861 } 2862 2863 if (opts.show_help) { 2864 printf("usage: %s [options]\n\n", argv[0]); 2865 fuse_cmdline_help(); 2866 printf(" -o source=PATH shared directory tree\n"); 2867 fuse_lowlevel_help(); 2868 ret = 0; 2869 goto err_out1; 2870 } else if (opts.show_version) { 2871 fuse_lowlevel_version(); 2872 ret = 0; 2873 goto err_out1; 2874 } else if (opts.print_capabilities) { 2875 print_capabilities(); 2876 ret = 0; 2877 goto err_out1; 2878 } 2879 2880 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { 2881 goto err_out1; 2882 } 2883 2884 /* 2885 * log_level is 0 if not configured via cmd options (0 is LOG_EMERG, 2886 * and we don't use this log level). 
2887 */ 2888 if (opts.log_level != 0) { 2889 current_log_level = opts.log_level; 2890 } 2891 lo.debug = opts.debug; 2892 if (lo.debug) { 2893 current_log_level = FUSE_LOG_DEBUG; 2894 } 2895 if (lo.source) { 2896 struct stat stat; 2897 int res; 2898 2899 res = lstat(lo.source, &stat); 2900 if (res == -1) { 2901 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", 2902 lo.source); 2903 exit(1); 2904 } 2905 if (!S_ISDIR(stat.st_mode)) { 2906 fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); 2907 exit(1); 2908 } 2909 } else { 2910 lo.source = strdup("/"); 2911 } 2912 if (!lo.timeout_set) { 2913 switch (lo.cache) { 2914 case CACHE_NONE: 2915 lo.timeout = 0.0; 2916 break; 2917 2918 case CACHE_AUTO: 2919 lo.timeout = 1.0; 2920 break; 2921 2922 case CACHE_ALWAYS: 2923 lo.timeout = 86400.0; 2924 break; 2925 } 2926 } else if (lo.timeout < 0) { 2927 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout); 2928 exit(1); 2929 } 2930 2931 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); 2932 if (se == NULL) { 2933 goto err_out1; 2934 } 2935 2936 if (fuse_set_signal_handlers(se) != 0) { 2937 goto err_out2; 2938 } 2939 2940 if (fuse_session_mount(se) != 0) { 2941 goto err_out3; 2942 } 2943 2944 fuse_daemonize(opts.foreground); 2945 2946 setup_nofile_rlimit(opts.rlimit_nofile); 2947 2948 /* Must be before sandbox since it wants /proc */ 2949 setup_capng(); 2950 2951 setup_sandbox(&lo, se, opts.syslog); 2952 2953 setup_root(&lo, &lo.root); 2954 /* Block until ctrl+c or fusermount -u */ 2955 ret = virtio_loop(se); 2956 2957 fuse_session_unmount(se); 2958 cleanup_capng(); 2959err_out3: 2960 fuse_remove_signal_handlers(se); 2961err_out2: 2962 fuse_session_destroy(se); 2963err_out1: 2964 fuse_opt_free_args(&args); 2965 2966 fuse_lo_data_cleanup(&lo); 2967 2968 return ret ? 1 : 0; 2969}