qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio
at master 3328 lines 93 kB view raw
1/* 2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License version 6 * 2 as published by the Free Software Foundation. 7 * 8 * You should have received a copy of the GNU General Public License 9 * along with this program. If not, see <http://www.gnu.org/licenses/>. 10 * 11 * Contributions after 2012-01-13 are licensed under the terms of the 12 * GNU GPL, version 2 or (at your option) any later version. 13 */ 14 15#include "qemu/osdep.h" 16#include "qemu-common.h" 17#include "qapi/error.h" 18#include "qapi/qapi-visit-sockets.h" 19#include "qapi/qapi-visit-block-core.h" 20#include "qapi/qmp/qdict.h" 21#include "qapi/qobject-input-visitor.h" 22#include "qapi/qobject-output-visitor.h" 23#include "qemu/uri.h" 24#include "qemu/error-report.h" 25#include "qemu/main-loop.h" 26#include "qemu/module.h" 27#include "qemu/option.h" 28#include "qemu/sockets.h" 29#include "block/block_int.h" 30#include "block/qdict.h" 31#include "sysemu/block-backend.h" 32#include "qemu/bitops.h" 33#include "qemu/cutils.h" 34#include "trace.h" 35 36#define SD_PROTO_VER 0x01 37 38#define SD_DEFAULT_ADDR "localhost" 39#define SD_DEFAULT_PORT 7000 40 41#define SD_OP_CREATE_AND_WRITE_OBJ 0x01 42#define SD_OP_READ_OBJ 0x02 43#define SD_OP_WRITE_OBJ 0x03 44/* 0x04 is used internally by Sheepdog */ 45 46#define SD_OP_NEW_VDI 0x11 47#define SD_OP_LOCK_VDI 0x12 48#define SD_OP_RELEASE_VDI 0x13 49#define SD_OP_GET_VDI_INFO 0x14 50#define SD_OP_READ_VDIS 0x15 51#define SD_OP_FLUSH_VDI 0x16 52#define SD_OP_DEL_VDI 0x17 53#define SD_OP_GET_CLUSTER_DEFAULT 0x18 54 55#define SD_FLAG_CMD_WRITE 0x01 56#define SD_FLAG_CMD_COW 0x02 57#define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */ 58#define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */ 59 60#define SD_RES_SUCCESS 0x00 /* Success */ 61#define SD_RES_UNKNOWN 0x01 /* Unknown error */ 62#define 
SD_RES_NO_OBJ 0x02 /* No object found */ 63#define SD_RES_EIO 0x03 /* I/O error */ 64#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */ 65#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ 66#define SD_RES_SYSTEM_ERROR 0x06 /* System error */ 67#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */ 68#define SD_RES_NO_VDI 0x08 /* No vdi found */ 69#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */ 70#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */ 71#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */ 72#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */ 73#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */ 74#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ 75#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ 76#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */ 77#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ 78#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ 79#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */ 80#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ 81#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ 82#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */ 83#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */ 84#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ 85#define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */ 86#define SD_RES_READONLY 0x1A /* Object is read-only */ 87 88/* 89 * Object ID rules 90 * 91 * 0 - 19 (20 bits): data object space 92 * 20 - 31 (12 bits): reserved data object space 93 * 32 - 55 (24 bits): vdi object space 94 * 56 - 59 ( 4 bits): reserved vdi object space 95 * 60 - 63 ( 4 bits): object type identifier space 96 */ 97 98#define VDI_SPACE_SHIFT 32 99#define VDI_BIT (UINT64_C(1) << 63) 100#define VMSTATE_BIT (UINT64_C(1) << 62) 101#define MAX_DATA_OBJS (UINT64_C(1) << 20) 102#define 
MAX_CHILDREN 1024 103#define SD_MAX_VDI_LEN 256 104#define SD_MAX_VDI_TAG_LEN 256 105#define SD_NR_VDIS (1U << 24) 106#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) 107#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) 108#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 109/* 110 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and 111 * (SD_EC_MAX_STRIP - 1) for parity strips 112 * 113 * SD_MAX_COPIES is sum of number of data strips and parity strips. 114 */ 115#define SD_EC_MAX_STRIP 16 116#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) 117 118#define SD_INODE_SIZE (sizeof(SheepdogInode)) 119#define CURRENT_VDI_ID 0 120 121#define LOCK_TYPE_NORMAL 0 122#define LOCK_TYPE_SHARED 1 /* for iSCSI multipath */ 123 124typedef struct SheepdogReq { 125 uint8_t proto_ver; 126 uint8_t opcode; 127 uint16_t flags; 128 uint32_t epoch; 129 uint32_t id; 130 uint32_t data_length; 131 uint32_t opcode_specific[8]; 132} SheepdogReq; 133 134typedef struct SheepdogRsp { 135 uint8_t proto_ver; 136 uint8_t opcode; 137 uint16_t flags; 138 uint32_t epoch; 139 uint32_t id; 140 uint32_t data_length; 141 uint32_t result; 142 uint32_t opcode_specific[7]; 143} SheepdogRsp; 144 145typedef struct SheepdogObjReq { 146 uint8_t proto_ver; 147 uint8_t opcode; 148 uint16_t flags; 149 uint32_t epoch; 150 uint32_t id; 151 uint32_t data_length; 152 uint64_t oid; 153 uint64_t cow_oid; 154 uint8_t copies; 155 uint8_t copy_policy; 156 uint8_t reserved[6]; 157 uint64_t offset; 158} SheepdogObjReq; 159 160typedef struct SheepdogObjRsp { 161 uint8_t proto_ver; 162 uint8_t opcode; 163 uint16_t flags; 164 uint32_t epoch; 165 uint32_t id; 166 uint32_t data_length; 167 uint32_t result; 168 uint8_t copies; 169 uint8_t copy_policy; 170 uint8_t reserved[2]; 171 uint32_t pad[6]; 172} SheepdogObjRsp; 173 174typedef struct SheepdogVdiReq { 175 uint8_t proto_ver; 176 uint8_t opcode; 177 uint16_t flags; 178 uint32_t epoch; 179 uint32_t id; 180 uint32_t data_length; 181 uint64_t vdi_size; 182 uint32_t 
base_vdi_id; 183 uint8_t copies; 184 uint8_t copy_policy; 185 uint8_t store_policy; 186 uint8_t block_size_shift; 187 uint32_t snapid; 188 uint32_t type; 189 uint32_t pad[2]; 190} SheepdogVdiReq; 191 192typedef struct SheepdogVdiRsp { 193 uint8_t proto_ver; 194 uint8_t opcode; 195 uint16_t flags; 196 uint32_t epoch; 197 uint32_t id; 198 uint32_t data_length; 199 uint32_t result; 200 uint32_t rsvd; 201 uint32_t vdi_id; 202 uint32_t pad[5]; 203} SheepdogVdiRsp; 204 205typedef struct SheepdogClusterRsp { 206 uint8_t proto_ver; 207 uint8_t opcode; 208 uint16_t flags; 209 uint32_t epoch; 210 uint32_t id; 211 uint32_t data_length; 212 uint32_t result; 213 uint8_t nr_copies; 214 uint8_t copy_policy; 215 uint8_t block_size_shift; 216 uint8_t __pad1; 217 uint32_t __pad2[6]; 218} SheepdogClusterRsp; 219 220typedef struct SheepdogInode { 221 char name[SD_MAX_VDI_LEN]; 222 char tag[SD_MAX_VDI_TAG_LEN]; 223 uint64_t ctime; 224 uint64_t snap_ctime; 225 uint64_t vm_clock_nsec; 226 uint64_t vdi_size; 227 uint64_t vm_state_size; 228 uint16_t copy_policy; 229 uint8_t nr_copies; 230 uint8_t block_size_shift; 231 uint32_t snap_id; 232 uint32_t vdi_id; 233 uint32_t parent_vdi_id; 234 uint32_t child_vdi_id[MAX_CHILDREN]; 235 uint32_t data_vdi_id[MAX_DATA_OBJS]; 236} SheepdogInode; 237 238#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id) 239 240/* 241 * 64 bit FNV-1a non-zero initial basis 242 */ 243#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) 244 245/* 246 * 64 bit Fowler/Noll/Vo FNV-1a hash code 247 */ 248static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) 249{ 250 unsigned char *bp = buf; 251 unsigned char *be = bp + len; 252 while (bp < be) { 253 hval ^= (uint64_t) *bp++; 254 hval += (hval << 1) + (hval << 4) + (hval << 5) + 255 (hval << 7) + (hval << 8) + (hval << 40); 256 } 257 return hval; 258} 259 260static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx) 261{ 262 return inode->vdi_id == 
inode->data_vdi_id[idx]; 263} 264 265static inline bool is_data_obj(uint64_t oid) 266{ 267 return !(VDI_BIT & oid); 268} 269 270static inline uint64_t data_oid_to_idx(uint64_t oid) 271{ 272 return oid & (MAX_DATA_OBJS - 1); 273} 274 275static inline uint32_t oid_to_vid(uint64_t oid) 276{ 277 return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; 278} 279 280static inline uint64_t vid_to_vdi_oid(uint32_t vid) 281{ 282 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); 283} 284 285static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) 286{ 287 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; 288} 289 290static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) 291{ 292 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; 293} 294 295static inline bool is_snapshot(struct SheepdogInode *inode) 296{ 297 return !!inode->snap_ctime; 298} 299 300static inline size_t count_data_objs(const struct SheepdogInode *inode) 301{ 302 return DIV_ROUND_UP(inode->vdi_size, 303 (1UL << inode->block_size_shift)); 304} 305 306typedef struct SheepdogAIOCB SheepdogAIOCB; 307typedef struct BDRVSheepdogState BDRVSheepdogState; 308 309typedef struct AIOReq { 310 SheepdogAIOCB *aiocb; 311 unsigned int iov_offset; 312 313 uint64_t oid; 314 uint64_t base_oid; 315 uint64_t offset; 316 unsigned int data_len; 317 uint8_t flags; 318 uint32_t id; 319 bool create; 320 321 QLIST_ENTRY(AIOReq) aio_siblings; 322} AIOReq; 323 324enum AIOCBState { 325 AIOCB_WRITE_UDATA, 326 AIOCB_READ_UDATA, 327 AIOCB_FLUSH_CACHE, 328 AIOCB_DISCARD_OBJ, 329}; 330 331#define AIOCBOverlapping(x, y) \ 332 (!(x->max_affect_data_idx < y->min_affect_data_idx \ 333 || y->max_affect_data_idx < x->min_affect_data_idx)) 334 335struct SheepdogAIOCB { 336 BDRVSheepdogState *s; 337 338 QEMUIOVector *qiov; 339 340 int64_t sector_num; 341 int nb_sectors; 342 343 int ret; 344 enum AIOCBState aiocb_type; 345 346 Coroutine *coroutine; 347 int nr_pending; 348 349 uint32_t min_affect_data_idx; 350 uint32_t 
max_affect_data_idx; 351 352 /* 353 * The difference between affect_data_idx and dirty_data_idx: 354 * affect_data_idx represents range of index of all request types. 355 * dirty_data_idx represents range of index updated by COW requests. 356 * dirty_data_idx is used for updating an inode object. 357 */ 358 uint32_t min_dirty_data_idx; 359 uint32_t max_dirty_data_idx; 360 361 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; 362}; 363 364struct BDRVSheepdogState { 365 BlockDriverState *bs; 366 AioContext *aio_context; 367 368 SheepdogInode inode; 369 370 char name[SD_MAX_VDI_LEN]; 371 bool is_snapshot; 372 uint32_t cache_flags; 373 bool discard_supported; 374 375 SocketAddress *addr; 376 int fd; 377 378 CoMutex lock; 379 Coroutine *co_send; 380 Coroutine *co_recv; 381 382 uint32_t aioreq_seq_num; 383 384 /* Every aio request must be linked to either of these queues. */ 385 QLIST_HEAD(, AIOReq) inflight_aio_head; 386 QLIST_HEAD(, AIOReq) failed_aio_head; 387 388 CoMutex queue_lock; 389 CoQueue overlapping_queue; 390 QLIST_HEAD(, SheepdogAIOCB) inflight_aiocb_head; 391}; 392 393typedef struct BDRVSheepdogReopenState { 394 int fd; 395 int cache_flags; 396} BDRVSheepdogReopenState; 397 398static const char *sd_strerror(int err) 399{ 400 int i; 401 402 static const struct { 403 int err; 404 const char *desc; 405 } errors[] = { 406 {SD_RES_SUCCESS, "Success"}, 407 {SD_RES_UNKNOWN, "Unknown error"}, 408 {SD_RES_NO_OBJ, "No object found"}, 409 {SD_RES_EIO, "I/O error"}, 410 {SD_RES_VDI_EXIST, "VDI exists already"}, 411 {SD_RES_INVALID_PARMS, "Invalid parameters"}, 412 {SD_RES_SYSTEM_ERROR, "System error"}, 413 {SD_RES_VDI_LOCKED, "VDI is already locked"}, 414 {SD_RES_NO_VDI, "No vdi found"}, 415 {SD_RES_NO_BASE_VDI, "No base VDI found"}, 416 {SD_RES_VDI_READ, "Failed read the requested VDI"}, 417 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"}, 418 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"}, 419 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"}, 420 
{SD_RES_NO_TAG, "Failed to find the requested tag"}, 421 {SD_RES_STARTUP, "The system is still booting"}, 422 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"}, 423 {SD_RES_SHUTDOWN, "The system is shutting down"}, 424 {SD_RES_NO_MEM, "Out of memory on the server"}, 425 {SD_RES_FULL_VDI, "We already have the maximum vdis"}, 426 {SD_RES_VER_MISMATCH, "Protocol version mismatch"}, 427 {SD_RES_NO_SPACE, "Server has no space for new objects"}, 428 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, 429 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, 430 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, 431 {SD_RES_HALT, "Sheepdog is stopped serving IO request"}, 432 {SD_RES_READONLY, "Object is read-only"}, 433 }; 434 435 for (i = 0; i < ARRAY_SIZE(errors); ++i) { 436 if (errors[i].err == err) { 437 return errors[i].desc; 438 } 439 } 440 441 return "Invalid error code"; 442} 443 444/* 445 * Sheepdog I/O handling: 446 * 447 * 1. In sd_co_rw_vector, we send the I/O requests to the server and 448 * link the requests to the inflight_list in the 449 * BDRVSheepdogState. The function yields while waiting for 450 * receiving the response. 451 * 452 * 2. We receive the response in aio_read_response, the fd handler to 453 * the sheepdog connection. We switch back to sd_co_readv/sd_writev 454 * after all the requests belonging to the AIOCB are finished. If 455 * needed, sd_co_writev will send another requests for the vdi object. 
456 */ 457 458static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, 459 uint64_t oid, unsigned int data_len, 460 uint64_t offset, uint8_t flags, bool create, 461 uint64_t base_oid, unsigned int iov_offset) 462{ 463 AIOReq *aio_req; 464 465 aio_req = g_malloc(sizeof(*aio_req)); 466 aio_req->aiocb = acb; 467 aio_req->iov_offset = iov_offset; 468 aio_req->oid = oid; 469 aio_req->base_oid = base_oid; 470 aio_req->offset = offset; 471 aio_req->data_len = data_len; 472 aio_req->flags = flags; 473 aio_req->id = s->aioreq_seq_num++; 474 aio_req->create = create; 475 476 acb->nr_pending++; 477 return aio_req; 478} 479 480static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb) 481{ 482 SheepdogAIOCB *cb; 483 484retry: 485 QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { 486 if (AIOCBOverlapping(acb, cb)) { 487 qemu_co_queue_wait(&s->overlapping_queue, &s->queue_lock); 488 goto retry; 489 } 490 } 491} 492 493static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s, 494 QEMUIOVector *qiov, int64_t sector_num, int nb_sectors, 495 int type) 496{ 497 uint32_t object_size; 498 499 object_size = (UINT32_C(1) << s->inode.block_size_shift); 500 501 acb->s = s; 502 503 acb->qiov = qiov; 504 505 acb->sector_num = sector_num; 506 acb->nb_sectors = nb_sectors; 507 508 acb->coroutine = qemu_coroutine_self(); 509 acb->ret = 0; 510 acb->nr_pending = 0; 511 512 acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; 513 acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + 514 acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; 515 516 acb->min_dirty_data_idx = UINT32_MAX; 517 acb->max_dirty_data_idx = 0; 518 acb->aiocb_type = type; 519 520 if (type == AIOCB_FLUSH_CACHE) { 521 return; 522 } 523 524 qemu_co_mutex_lock(&s->queue_lock); 525 wait_for_overlapping_aiocb(s, acb); 526 QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings); 527 qemu_co_mutex_unlock(&s->queue_lock); 528} 529 
530static SocketAddress *sd_server_config(QDict *options, Error **errp) 531{ 532 QDict *server = NULL; 533 Visitor *iv = NULL; 534 SocketAddress *saddr = NULL; 535 536 qdict_extract_subqdict(options, &server, "server."); 537 538 iv = qobject_input_visitor_new_flat_confused(server, errp); 539 if (!iv) { 540 goto done; 541 } 542 543 if (!visit_type_SocketAddress(iv, NULL, &saddr, errp)) { 544 goto done; 545 } 546 547done: 548 visit_free(iv); 549 qobject_unref(server); 550 return saddr; 551} 552 553/* Return -EIO in case of error, file descriptor on success */ 554static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) 555{ 556 int fd; 557 558 fd = socket_connect(s->addr, errp); 559 560 if (s->addr->type == SOCKET_ADDRESS_TYPE_INET && fd >= 0) { 561 int ret = socket_set_nodelay(fd); 562 if (ret < 0) { 563 warn_report("can't set TCP_NODELAY: %s", strerror(errno)); 564 } 565 } 566 567 if (fd >= 0) { 568 qemu_set_nonblock(fd); 569 } else { 570 fd = -EIO; 571 } 572 573 return fd; 574} 575 576/* Return 0 on success and -errno in case of error */ 577static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, 578 unsigned int *wlen) 579{ 580 int ret; 581 582 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); 583 if (ret != sizeof(*hdr)) { 584 error_report("failed to send a req, %s", strerror(errno)); 585 return -errno; 586 } 587 588 ret = qemu_co_send(sockfd, data, *wlen); 589 if (ret != *wlen) { 590 error_report("failed to send a req, %s", strerror(errno)); 591 return -errno; 592 } 593 594 return ret; 595} 596 597typedef struct SheepdogReqCo { 598 int sockfd; 599 BlockDriverState *bs; 600 AioContext *aio_context; 601 SheepdogReq *hdr; 602 void *data; 603 unsigned int *wlen; 604 unsigned int *rlen; 605 int ret; 606 bool finished; 607 Coroutine *co; 608} SheepdogReqCo; 609 610static void restart_co_req(void *opaque) 611{ 612 SheepdogReqCo *srco = opaque; 613 614 aio_co_wake(srco->co); 615} 616 617static coroutine_fn void do_co_req(void *opaque) 618{ 619 
int ret; 620 SheepdogReqCo *srco = opaque; 621 int sockfd = srco->sockfd; 622 SheepdogReq *hdr = srco->hdr; 623 void *data = srco->data; 624 unsigned int *wlen = srco->wlen; 625 unsigned int *rlen = srco->rlen; 626 627 srco->co = qemu_coroutine_self(); 628 aio_set_fd_handler(srco->aio_context, sockfd, false, 629 NULL, restart_co_req, NULL, srco); 630 631 ret = send_co_req(sockfd, hdr, data, wlen); 632 if (ret < 0) { 633 goto out; 634 } 635 636 aio_set_fd_handler(srco->aio_context, sockfd, false, 637 restart_co_req, NULL, NULL, srco); 638 639 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); 640 if (ret != sizeof(*hdr)) { 641 error_report("failed to get a rsp, %s", strerror(errno)); 642 ret = -errno; 643 goto out; 644 } 645 646 if (*rlen > hdr->data_length) { 647 *rlen = hdr->data_length; 648 } 649 650 if (*rlen) { 651 ret = qemu_co_recv(sockfd, data, *rlen); 652 if (ret != *rlen) { 653 error_report("failed to get the data, %s", strerror(errno)); 654 ret = -errno; 655 goto out; 656 } 657 } 658 ret = 0; 659out: 660 /* there is at most one request for this sockfd, so it is safe to 661 * set each handler to NULL. */ 662 aio_set_fd_handler(srco->aio_context, sockfd, false, 663 NULL, NULL, NULL, NULL); 664 665 srco->co = NULL; 666 srco->ret = ret; 667 /* Set srco->finished before reading bs->wakeup. */ 668 atomic_mb_set(&srco->finished, true); 669 if (srco->bs) { 670 bdrv_wakeup(srco->bs); 671 } 672} 673 674/* 675 * Send the request to the sheep in a synchronous manner. 676 * 677 * Return 0 on success, -errno in case of error. 678 */ 679static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr, 680 void *data, unsigned int *wlen, unsigned int *rlen) 681{ 682 Coroutine *co; 683 SheepdogReqCo srco = { 684 .sockfd = sockfd, 685 .aio_context = bs ? 
bdrv_get_aio_context(bs) : qemu_get_aio_context(), 686 .bs = bs, 687 .hdr = hdr, 688 .data = data, 689 .wlen = wlen, 690 .rlen = rlen, 691 .ret = 0, 692 .finished = false, 693 }; 694 695 if (qemu_in_coroutine()) { 696 do_co_req(&srco); 697 } else { 698 co = qemu_coroutine_create(do_co_req, &srco); 699 if (bs) { 700 bdrv_coroutine_enter(bs, co); 701 BDRV_POLL_WHILE(bs, !srco.finished); 702 } else { 703 qemu_coroutine_enter(co); 704 while (!srco.finished) { 705 aio_poll(qemu_get_aio_context(), true); 706 } 707 } 708 } 709 710 return srco.ret; 711} 712 713static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, 714 struct iovec *iov, int niov, 715 enum AIOCBState aiocb_type); 716static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); 717static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); 718static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); 719static void co_write_request(void *opaque); 720 721static coroutine_fn void reconnect_to_sdog(void *opaque) 722{ 723 BDRVSheepdogState *s = opaque; 724 AIOReq *aio_req, *next; 725 726 aio_set_fd_handler(s->aio_context, s->fd, false, NULL, 727 NULL, NULL, NULL); 728 close(s->fd); 729 s->fd = -1; 730 731 /* Wait for outstanding write requests to be completed. */ 732 while (s->co_send != NULL) { 733 co_write_request(opaque); 734 } 735 736 /* Try to reconnect the sheepdog server every one second. */ 737 while (s->fd < 0) { 738 Error *local_err = NULL; 739 s->fd = get_sheep_fd(s, &local_err); 740 if (s->fd < 0) { 741 trace_sheepdog_reconnect_to_sdog(); 742 error_report_err(local_err); 743 qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000000ULL); 744 } 745 }; 746 747 /* 748 * Now we have to resend all the request in the inflight queue. However, 749 * resend_aioreq() can yield and newly created requests can be added to the 750 * inflight queue before the coroutine is resumed. 
To avoid mixing them, we 751 * have to move all the inflight requests to the failed queue before 752 * resend_aioreq() is called. 753 */ 754 qemu_co_mutex_lock(&s->queue_lock); 755 QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { 756 QLIST_REMOVE(aio_req, aio_siblings); 757 QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); 758 } 759 760 /* Resend all the failed aio requests. */ 761 while (!QLIST_EMPTY(&s->failed_aio_head)) { 762 aio_req = QLIST_FIRST(&s->failed_aio_head); 763 QLIST_REMOVE(aio_req, aio_siblings); 764 qemu_co_mutex_unlock(&s->queue_lock); 765 resend_aioreq(s, aio_req); 766 qemu_co_mutex_lock(&s->queue_lock); 767 } 768 qemu_co_mutex_unlock(&s->queue_lock); 769} 770 771/* 772 * Receive responses of the I/O requests. 773 * 774 * This function is registered as a fd handler, and called from the 775 * main loop when s->fd is ready for reading responses. 776 */ 777static void coroutine_fn aio_read_response(void *opaque) 778{ 779 SheepdogObjRsp rsp; 780 BDRVSheepdogState *s = opaque; 781 int fd = s->fd; 782 int ret; 783 AIOReq *aio_req = NULL; 784 SheepdogAIOCB *acb; 785 uint64_t idx; 786 787 /* read a header */ 788 ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); 789 if (ret != sizeof(rsp)) { 790 error_report("failed to get the header, %s", strerror(errno)); 791 goto err; 792 } 793 794 /* find the right aio_req from the inflight aio list */ 795 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) { 796 if (aio_req->id == rsp.id) { 797 break; 798 } 799 } 800 if (!aio_req) { 801 error_report("cannot find aio_req %x", rsp.id); 802 goto err; 803 } 804 805 acb = aio_req->aiocb; 806 807 switch (acb->aiocb_type) { 808 case AIOCB_WRITE_UDATA: 809 if (!is_data_obj(aio_req->oid)) { 810 break; 811 } 812 idx = data_oid_to_idx(aio_req->oid); 813 814 if (aio_req->create) { 815 /* 816 * If the object is newly created one, we need to update 817 * the vdi object (metadata object). 
min_dirty_data_idx 818 * and max_dirty_data_idx are changed to include updated 819 * index between them. 820 */ 821 if (rsp.result == SD_RES_SUCCESS) { 822 s->inode.data_vdi_id[idx] = s->inode.vdi_id; 823 acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx); 824 acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx); 825 } 826 } 827 break; 828 case AIOCB_READ_UDATA: 829 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, 830 aio_req->iov_offset, rsp.data_length); 831 if (ret != rsp.data_length) { 832 error_report("failed to get the data, %s", strerror(errno)); 833 goto err; 834 } 835 break; 836 case AIOCB_FLUSH_CACHE: 837 if (rsp.result == SD_RES_INVALID_PARMS) { 838 trace_sheepdog_aio_read_response(); 839 s->cache_flags = SD_FLAG_CMD_DIRECT; 840 rsp.result = SD_RES_SUCCESS; 841 } 842 break; 843 case AIOCB_DISCARD_OBJ: 844 switch (rsp.result) { 845 case SD_RES_INVALID_PARMS: 846 error_report("server doesn't support discard command"); 847 rsp.result = SD_RES_SUCCESS; 848 s->discard_supported = false; 849 break; 850 default: 851 break; 852 } 853 } 854 855 /* No more data for this aio_req (reload_inode below uses its own file 856 * descriptor handler which doesn't use co_recv). 
857 */ 858 s->co_recv = NULL; 859 860 qemu_co_mutex_lock(&s->queue_lock); 861 QLIST_REMOVE(aio_req, aio_siblings); 862 qemu_co_mutex_unlock(&s->queue_lock); 863 864 switch (rsp.result) { 865 case SD_RES_SUCCESS: 866 break; 867 case SD_RES_READONLY: 868 if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { 869 ret = reload_inode(s, 0, ""); 870 if (ret < 0) { 871 goto err; 872 } 873 } 874 if (is_data_obj(aio_req->oid)) { 875 aio_req->oid = vid_to_data_oid(s->inode.vdi_id, 876 data_oid_to_idx(aio_req->oid)); 877 } else { 878 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); 879 } 880 resend_aioreq(s, aio_req); 881 return; 882 default: 883 acb->ret = -EIO; 884 error_report("%s", sd_strerror(rsp.result)); 885 break; 886 } 887 888 g_free(aio_req); 889 890 if (!--acb->nr_pending) { 891 /* 892 * We've finished all requests which belong to the AIOCB, so 893 * we can switch back to sd_co_readv/writev now. 894 */ 895 aio_co_wake(acb->coroutine); 896 } 897 898 return; 899 900err: 901 reconnect_to_sdog(opaque); 902} 903 904static void co_read_response(void *opaque) 905{ 906 BDRVSheepdogState *s = opaque; 907 908 if (!s->co_recv) { 909 s->co_recv = qemu_coroutine_create(aio_read_response, opaque); 910 } 911 912 aio_co_enter(s->aio_context, s->co_recv); 913} 914 915static void co_write_request(void *opaque) 916{ 917 BDRVSheepdogState *s = opaque; 918 919 aio_co_wake(s->co_send); 920} 921 922/* 923 * Return a socket descriptor to read/write objects. 924 * 925 * We cannot use this descriptor for other operations because 926 * the block driver may be on waiting response from the server. 927 */ 928static int get_sheep_fd(BDRVSheepdogState *s, Error **errp) 929{ 930 int fd; 931 932 fd = connect_to_sdog(s, errp); 933 if (fd < 0) { 934 return fd; 935 } 936 937 aio_set_fd_handler(s->aio_context, fd, false, 938 co_read_response, NULL, NULL, s); 939 return fd; 940} 941 942/* 943 * Parse numeric snapshot ID in @str 944 * If @str can't be parsed as number, return false. 
945 * Else, if the number is zero or too large, set *@snapid to zero and 946 * return true. 947 * Else, set *@snapid to the number and return true. 948 */ 949static bool sd_parse_snapid(const char *str, uint32_t *snapid) 950{ 951 unsigned long ul; 952 int ret; 953 954 ret = qemu_strtoul(str, NULL, 10, &ul); 955 if (ret == -ERANGE) { 956 ul = ret = 0; 957 } 958 if (ret) { 959 return false; 960 } 961 if (ul > UINT32_MAX) { 962 ul = 0; 963 } 964 965 *snapid = ul; 966 return true; 967} 968 969static bool sd_parse_snapid_or_tag(const char *str, 970 uint32_t *snapid, char tag[]) 971{ 972 if (!sd_parse_snapid(str, snapid)) { 973 *snapid = 0; 974 if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) { 975 return false; 976 } 977 } else if (!*snapid) { 978 return false; 979 } else { 980 tag[0] = 0; 981 } 982 return true; 983} 984 985typedef struct { 986 const char *path; /* non-null iff transport is tcp */ 987 const char *host; /* valid when transport is tcp */ 988 int port; /* valid when transport is tcp */ 989 char vdi[SD_MAX_VDI_LEN]; 990 char tag[SD_MAX_VDI_TAG_LEN]; 991 uint32_t snap_id; 992 /* Remainder is only for sd_config_done() */ 993 URI *uri; 994 QueryParams *qp; 995} SheepdogConfig; 996 997static void sd_config_done(SheepdogConfig *cfg) 998{ 999 if (cfg->qp) { 1000 query_params_free(cfg->qp); 1001 } 1002 uri_free(cfg->uri); 1003} 1004 1005static void sd_parse_uri(SheepdogConfig *cfg, const char *filename, 1006 Error **errp) 1007{ 1008 Error *err = NULL; 1009 QueryParams *qp = NULL; 1010 bool is_unix; 1011 URI *uri; 1012 1013 memset(cfg, 0, sizeof(*cfg)); 1014 1015 cfg->uri = uri = uri_parse(filename); 1016 if (!uri) { 1017 error_setg(&err, "invalid URI '%s'", filename); 1018 goto out; 1019 } 1020 1021 /* transport */ 1022 if (!g_strcmp0(uri->scheme, "sheepdog")) { 1023 is_unix = false; 1024 } else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) { 1025 is_unix = false; 1026 } else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) { 1027 is_unix = true; 
1028 } else { 1029 error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp'," 1030 " or 'sheepdog+unix'"); 1031 goto out; 1032 } 1033 1034 if (uri->path == NULL || !strcmp(uri->path, "/")) { 1035 error_setg(&err, "missing file path in URI"); 1036 goto out; 1037 } 1038 if (g_strlcpy(cfg->vdi, uri->path + 1, SD_MAX_VDI_LEN) 1039 >= SD_MAX_VDI_LEN) { 1040 error_setg(&err, "VDI name is too long"); 1041 goto out; 1042 } 1043 1044 cfg->qp = qp = query_params_parse(uri->query); 1045 1046 if (is_unix) { 1047 /* sheepdog+unix:///vdiname?socket=path */ 1048 if (uri->server || uri->port) { 1049 error_setg(&err, "URI scheme %s doesn't accept a server address", 1050 uri->scheme); 1051 goto out; 1052 } 1053 if (!qp->n) { 1054 error_setg(&err, 1055 "URI scheme %s requires query parameter 'socket'", 1056 uri->scheme); 1057 goto out; 1058 } 1059 if (qp->n != 1 || strcmp(qp->p[0].name, "socket")) { 1060 error_setg(&err, "unexpected query parameters"); 1061 goto out; 1062 } 1063 cfg->path = qp->p[0].value; 1064 } else { 1065 /* sheepdog[+tcp]://[host:port]/vdiname */ 1066 if (qp->n) { 1067 error_setg(&err, "unexpected query parameters"); 1068 goto out; 1069 } 1070 cfg->host = uri->server; 1071 cfg->port = uri->port; 1072 } 1073 1074 /* snapshot tag */ 1075 if (uri->fragment) { 1076 if (!sd_parse_snapid_or_tag(uri->fragment, 1077 &cfg->snap_id, cfg->tag)) { 1078 error_setg(&err, "'%s' is not a valid snapshot ID", 1079 uri->fragment); 1080 goto out; 1081 } 1082 } else { 1083 cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */ 1084 } 1085 1086out: 1087 if (err) { 1088 error_propagate(errp, err); 1089 sd_config_done(cfg); 1090 } 1091} 1092 1093/* 1094 * Parse a filename (old syntax) 1095 * 1096 * filename must be one of the following formats: 1097 * 1. [vdiname] 1098 * 2. [vdiname]:[snapid] 1099 * 3. [vdiname]:[tag] 1100 * 4. [hostname]:[port]:[vdiname] 1101 * 5. [hostname]:[port]:[vdiname]:[snapid] 1102 * 6. 
[hostname]:[port]:[vdiname]:[tag] 1103 * 1104 * You can boot from the snapshot images by specifying `snapid` or 1105 * `tag'. 1106 * 1107 * You can run VMs outside the Sheepdog cluster by specifying 1108 * `hostname' and `port' (experimental). 1109 */ 1110static void parse_vdiname(SheepdogConfig *cfg, const char *filename, 1111 Error **errp) 1112{ 1113 Error *err = NULL; 1114 char *p, *q, *uri; 1115 const char *host_spec, *vdi_spec; 1116 int nr_sep; 1117 1118 strstart(filename, "sheepdog:", &filename); 1119 p = q = g_strdup(filename); 1120 1121 /* count the number of separators */ 1122 nr_sep = 0; 1123 while (*p) { 1124 if (*p == ':') { 1125 nr_sep++; 1126 } 1127 p++; 1128 } 1129 p = q; 1130 1131 /* use the first two tokens as host_spec. */ 1132 if (nr_sep >= 2) { 1133 host_spec = p; 1134 p = strchr(p, ':'); 1135 p++; 1136 p = strchr(p, ':'); 1137 *p++ = '\0'; 1138 } else { 1139 host_spec = ""; 1140 } 1141 1142 vdi_spec = p; 1143 1144 p = strchr(vdi_spec, ':'); 1145 if (p) { 1146 *p++ = '#'; 1147 } 1148 1149 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec); 1150 1151 /* 1152 * FIXME We to escape URI meta-characters, e.g. "x?y=z" 1153 * produces "sheepdog://x?y=z". Because of that ... 1154 */ 1155 sd_parse_uri(cfg, uri, &err); 1156 if (err) { 1157 /* 1158 * ... this can fail, but the error message is misleading. 1159 * Replace it by the traditional useless one until the 1160 * escaping is fixed. 
1161 */ 1162 error_free(err); 1163 error_setg(errp, "Can't parse filename"); 1164 } 1165 1166 g_free(q); 1167 g_free(uri); 1168} 1169 1170static void sd_parse_filename(const char *filename, QDict *options, 1171 Error **errp) 1172{ 1173 Error *err = NULL; 1174 SheepdogConfig cfg; 1175 char buf[32]; 1176 1177 if (strstr(filename, "://")) { 1178 sd_parse_uri(&cfg, filename, &err); 1179 } else { 1180 parse_vdiname(&cfg, filename, &err); 1181 } 1182 if (err) { 1183 error_propagate(errp, err); 1184 return; 1185 } 1186 1187 if (cfg.path) { 1188 qdict_set_default_str(options, "server.path", cfg.path); 1189 qdict_set_default_str(options, "server.type", "unix"); 1190 } else { 1191 qdict_set_default_str(options, "server.type", "inet"); 1192 qdict_set_default_str(options, "server.host", 1193 cfg.host ?: SD_DEFAULT_ADDR); 1194 snprintf(buf, sizeof(buf), "%d", cfg.port ?: SD_DEFAULT_PORT); 1195 qdict_set_default_str(options, "server.port", buf); 1196 } 1197 qdict_set_default_str(options, "vdi", cfg.vdi); 1198 qdict_set_default_str(options, "tag", cfg.tag); 1199 if (cfg.snap_id) { 1200 snprintf(buf, sizeof(buf), "%d", cfg.snap_id); 1201 qdict_set_default_str(options, "snap-id", buf); 1202 } 1203 1204 sd_config_done(&cfg); 1205} 1206 1207static int find_vdi_name(BDRVSheepdogState *s, const char *filename, 1208 uint32_t snapid, const char *tag, uint32_t *vid, 1209 bool lock, Error **errp) 1210{ 1211 int ret, fd; 1212 SheepdogVdiReq hdr; 1213 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; 1214 unsigned int wlen, rlen = 0; 1215 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN] QEMU_NONSTRING; 1216 1217 fd = connect_to_sdog(s, errp); 1218 if (fd < 0) { 1219 return fd; 1220 } 1221 1222 /* This pair of strncpy calls ensures that the buffer is zero-filled, 1223 * which is desirable since we'll soon be sending those bytes, and 1224 * don't want the send_req to read uninitialized data. 
1225 */ 1226 strncpy(buf, filename, SD_MAX_VDI_LEN); 1227 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); 1228 1229 memset(&hdr, 0, sizeof(hdr)); 1230 if (lock) { 1231 hdr.opcode = SD_OP_LOCK_VDI; 1232 hdr.type = LOCK_TYPE_NORMAL; 1233 } else { 1234 hdr.opcode = SD_OP_GET_VDI_INFO; 1235 } 1236 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; 1237 hdr.proto_ver = SD_PROTO_VER; 1238 hdr.data_length = wlen; 1239 hdr.snapid = snapid; 1240 hdr.flags = SD_FLAG_CMD_WRITE; 1241 1242 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen); 1243 if (ret) { 1244 error_setg_errno(errp, -ret, "cannot get vdi info"); 1245 goto out; 1246 } 1247 1248 if (rsp->result != SD_RES_SUCCESS) { 1249 error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s", 1250 sd_strerror(rsp->result), filename, snapid, tag); 1251 if (rsp->result == SD_RES_NO_VDI) { 1252 ret = -ENOENT; 1253 } else if (rsp->result == SD_RES_VDI_LOCKED) { 1254 ret = -EBUSY; 1255 } else { 1256 ret = -EIO; 1257 } 1258 goto out; 1259 } 1260 *vid = rsp->vdi_id; 1261 1262 ret = 0; 1263out: 1264 closesocket(fd); 1265 return ret; 1266} 1267 1268static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, 1269 struct iovec *iov, int niov, 1270 enum AIOCBState aiocb_type) 1271{ 1272 int nr_copies = s->inode.nr_copies; 1273 SheepdogObjReq hdr; 1274 unsigned int wlen = 0; 1275 int ret; 1276 uint64_t oid = aio_req->oid; 1277 unsigned int datalen = aio_req->data_len; 1278 uint64_t offset = aio_req->offset; 1279 uint8_t flags = aio_req->flags; 1280 uint64_t old_oid = aio_req->base_oid; 1281 bool create = aio_req->create; 1282 1283 qemu_co_mutex_lock(&s->queue_lock); 1284 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); 1285 qemu_co_mutex_unlock(&s->queue_lock); 1286 1287 if (!nr_copies) { 1288 error_report("bug"); 1289 } 1290 1291 memset(&hdr, 0, sizeof(hdr)); 1292 1293 switch (aiocb_type) { 1294 case AIOCB_FLUSH_CACHE: 1295 hdr.opcode = SD_OP_FLUSH_VDI; 1296 break; 1297 case 
AIOCB_READ_UDATA: 1298 hdr.opcode = SD_OP_READ_OBJ; 1299 hdr.flags = flags; 1300 break; 1301 case AIOCB_WRITE_UDATA: 1302 if (create) { 1303 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; 1304 } else { 1305 hdr.opcode = SD_OP_WRITE_OBJ; 1306 } 1307 wlen = datalen; 1308 hdr.flags = SD_FLAG_CMD_WRITE | flags; 1309 break; 1310 case AIOCB_DISCARD_OBJ: 1311 hdr.opcode = SD_OP_WRITE_OBJ; 1312 hdr.flags = SD_FLAG_CMD_WRITE | flags; 1313 s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0; 1314 offset = offsetof(SheepdogInode, 1315 data_vdi_id[data_oid_to_idx(oid)]); 1316 oid = vid_to_vdi_oid(s->inode.vdi_id); 1317 wlen = datalen = sizeof(uint32_t); 1318 break; 1319 } 1320 1321 if (s->cache_flags) { 1322 hdr.flags |= s->cache_flags; 1323 } 1324 1325 hdr.oid = oid; 1326 hdr.cow_oid = old_oid; 1327 hdr.copies = s->inode.nr_copies; 1328 1329 hdr.data_length = datalen; 1330 hdr.offset = offset; 1331 1332 hdr.id = aio_req->id; 1333 1334 qemu_co_mutex_lock(&s->lock); 1335 s->co_send = qemu_coroutine_self(); 1336 aio_set_fd_handler(s->aio_context, s->fd, false, 1337 co_read_response, co_write_request, NULL, s); 1338 socket_set_cork(s->fd, 1); 1339 1340 /* send a header */ 1341 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); 1342 if (ret != sizeof(hdr)) { 1343 error_report("failed to send a req, %s", strerror(errno)); 1344 goto out; 1345 } 1346 1347 if (wlen) { 1348 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); 1349 if (ret != wlen) { 1350 error_report("failed to send a data, %s", strerror(errno)); 1351 } 1352 } 1353out: 1354 socket_set_cork(s->fd, 0); 1355 aio_set_fd_handler(s->aio_context, s->fd, false, 1356 co_read_response, NULL, NULL, s); 1357 s->co_send = NULL; 1358 qemu_co_mutex_unlock(&s->lock); 1359} 1360 1361static int read_write_object(int fd, BlockDriverState *bs, char *buf, 1362 uint64_t oid, uint8_t copies, 1363 unsigned int datalen, uint64_t offset, 1364 bool write, bool create, uint32_t cache_flags) 1365{ 1366 SheepdogObjReq hdr; 1367 SheepdogObjRsp *rsp = 
(SheepdogObjRsp *)&hdr; 1368 unsigned int wlen, rlen; 1369 int ret; 1370 1371 memset(&hdr, 0, sizeof(hdr)); 1372 1373 if (write) { 1374 wlen = datalen; 1375 rlen = 0; 1376 hdr.flags = SD_FLAG_CMD_WRITE; 1377 if (create) { 1378 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; 1379 } else { 1380 hdr.opcode = SD_OP_WRITE_OBJ; 1381 } 1382 } else { 1383 wlen = 0; 1384 rlen = datalen; 1385 hdr.opcode = SD_OP_READ_OBJ; 1386 } 1387 1388 hdr.flags |= cache_flags; 1389 1390 hdr.oid = oid; 1391 hdr.data_length = datalen; 1392 hdr.offset = offset; 1393 hdr.copies = copies; 1394 1395 ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen); 1396 if (ret) { 1397 error_report("failed to send a request to the sheep"); 1398 return ret; 1399 } 1400 1401 switch (rsp->result) { 1402 case SD_RES_SUCCESS: 1403 return 0; 1404 default: 1405 error_report("%s", sd_strerror(rsp->result)); 1406 return -EIO; 1407 } 1408} 1409 1410static int read_object(int fd, BlockDriverState *bs, char *buf, 1411 uint64_t oid, uint8_t copies, 1412 unsigned int datalen, uint64_t offset, 1413 uint32_t cache_flags) 1414{ 1415 return read_write_object(fd, bs, buf, oid, copies, 1416 datalen, offset, false, 1417 false, cache_flags); 1418} 1419 1420static int write_object(int fd, BlockDriverState *bs, char *buf, 1421 uint64_t oid, uint8_t copies, 1422 unsigned int datalen, uint64_t offset, bool create, 1423 uint32_t cache_flags) 1424{ 1425 return read_write_object(fd, bs, buf, oid, copies, 1426 datalen, offset, true, 1427 create, cache_flags); 1428} 1429 1430/* update inode with the latest state */ 1431static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) 1432{ 1433 Error *local_err = NULL; 1434 SheepdogInode *inode; 1435 int ret = 0, fd; 1436 uint32_t vid = 0; 1437 1438 fd = connect_to_sdog(s, &local_err); 1439 if (fd < 0) { 1440 error_report_err(local_err); 1441 return -EIO; 1442 } 1443 1444 inode = g_malloc(SD_INODE_HEADER_SIZE); 1445 1446 ret = find_vdi_name(s, s->name, snapid, tag, &vid, 
false, &local_err); 1447 if (ret) { 1448 error_report_err(local_err); 1449 goto out; 1450 } 1451 1452 ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid), 1453 s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0, 1454 s->cache_flags); 1455 if (ret < 0) { 1456 goto out; 1457 } 1458 1459 if (inode->vdi_id != s->inode.vdi_id) { 1460 memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE); 1461 } 1462 1463out: 1464 g_free(inode); 1465 closesocket(fd); 1466 1467 return ret; 1468} 1469 1470static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) 1471{ 1472 SheepdogAIOCB *acb = aio_req->aiocb; 1473 1474 aio_req->create = false; 1475 1476 /* check whether this request becomes a CoW one */ 1477 if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { 1478 int idx = data_oid_to_idx(aio_req->oid); 1479 1480 if (is_data_obj_writable(&s->inode, idx)) { 1481 goto out; 1482 } 1483 1484 if (s->inode.data_vdi_id[idx]) { 1485 aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); 1486 aio_req->flags |= SD_FLAG_CMD_COW; 1487 } 1488 aio_req->create = true; 1489 } 1490out: 1491 if (is_data_obj(aio_req->oid)) { 1492 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, 1493 acb->aiocb_type); 1494 } else { 1495 struct iovec iov; 1496 iov.iov_base = &s->inode; 1497 iov.iov_len = sizeof(s->inode); 1498 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); 1499 } 1500} 1501 1502static void sd_detach_aio_context(BlockDriverState *bs) 1503{ 1504 BDRVSheepdogState *s = bs->opaque; 1505 1506 aio_set_fd_handler(s->aio_context, s->fd, false, NULL, 1507 NULL, NULL, NULL); 1508} 1509 1510static void sd_attach_aio_context(BlockDriverState *bs, 1511 AioContext *new_context) 1512{ 1513 BDRVSheepdogState *s = bs->opaque; 1514 1515 s->aio_context = new_context; 1516 aio_set_fd_handler(new_context, s->fd, false, 1517 co_read_response, NULL, NULL, s); 1518} 1519 1520static QemuOptsList runtime_opts = { 1521 .name = "sheepdog", 1522 .head = 
QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 1523 .desc = { 1524 { 1525 .name = "vdi", 1526 .type = QEMU_OPT_STRING, 1527 }, 1528 { 1529 .name = "snap-id", 1530 .type = QEMU_OPT_NUMBER, 1531 }, 1532 { 1533 .name = "tag", 1534 .type = QEMU_OPT_STRING, 1535 }, 1536 { /* end of list */ } 1537 }, 1538}; 1539 1540static int sd_open(BlockDriverState *bs, QDict *options, int flags, 1541 Error **errp) 1542{ 1543 int ret, fd; 1544 uint32_t vid = 0; 1545 BDRVSheepdogState *s = bs->opaque; 1546 const char *vdi, *snap_id_str, *tag; 1547 uint64_t snap_id; 1548 char *buf = NULL; 1549 QemuOpts *opts; 1550 1551 s->bs = bs; 1552 s->aio_context = bdrv_get_aio_context(bs); 1553 1554 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 1555 if (!qemu_opts_absorb_qdict(opts, options, errp)) { 1556 ret = -EINVAL; 1557 goto err_no_fd; 1558 } 1559 1560 s->addr = sd_server_config(options, errp); 1561 if (!s->addr) { 1562 ret = -EINVAL; 1563 goto err_no_fd; 1564 } 1565 1566 vdi = qemu_opt_get(opts, "vdi"); 1567 snap_id_str = qemu_opt_get(opts, "snap-id"); 1568 snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID); 1569 tag = qemu_opt_get(opts, "tag"); 1570 1571 if (!vdi) { 1572 error_setg(errp, "parameter 'vdi' is missing"); 1573 ret = -EINVAL; 1574 goto err_no_fd; 1575 } 1576 if (strlen(vdi) >= SD_MAX_VDI_LEN) { 1577 error_setg(errp, "value of parameter 'vdi' is too long"); 1578 ret = -EINVAL; 1579 goto err_no_fd; 1580 } 1581 1582 if (snap_id > UINT32_MAX) { 1583 snap_id = 0; 1584 } 1585 if (snap_id_str && !snap_id) { 1586 error_setg(errp, "'snap-id=%s' is not a valid snapshot ID", 1587 snap_id_str); 1588 ret = -EINVAL; 1589 goto err_no_fd; 1590 } 1591 1592 if (!tag) { 1593 tag = ""; 1594 } 1595 if (strlen(tag) >= SD_MAX_VDI_TAG_LEN) { 1596 error_setg(errp, "value of parameter 'tag' is too long"); 1597 ret = -EINVAL; 1598 goto err_no_fd; 1599 } 1600 1601 QLIST_INIT(&s->inflight_aio_head); 1602 QLIST_INIT(&s->failed_aio_head); 1603 QLIST_INIT(&s->inflight_aiocb_head); 1604 
1605 s->fd = get_sheep_fd(s, errp); 1606 if (s->fd < 0) { 1607 ret = s->fd; 1608 goto err_no_fd; 1609 } 1610 1611 ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp); 1612 if (ret) { 1613 goto err; 1614 } 1615 1616 /* 1617 * QEMU block layer emulates writethrough cache as 'writeback + flush', so 1618 * we always set SD_FLAG_CMD_CACHE (writeback cache) as default. 1619 */ 1620 s->cache_flags = SD_FLAG_CMD_CACHE; 1621 if (flags & BDRV_O_NOCACHE) { 1622 s->cache_flags = SD_FLAG_CMD_DIRECT; 1623 } 1624 s->discard_supported = true; 1625 1626 if (snap_id || tag[0]) { 1627 trace_sheepdog_open(vid); 1628 s->is_snapshot = true; 1629 } 1630 1631 fd = connect_to_sdog(s, errp); 1632 if (fd < 0) { 1633 ret = fd; 1634 goto err; 1635 } 1636 1637 buf = g_malloc(SD_INODE_SIZE); 1638 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid), 1639 0, SD_INODE_SIZE, 0, s->cache_flags); 1640 1641 closesocket(fd); 1642 1643 if (ret) { 1644 error_setg(errp, "Can't read snapshot inode"); 1645 goto err; 1646 } 1647 1648 memcpy(&s->inode, buf, sizeof(s->inode)); 1649 1650 bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; 1651 bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; 1652 pstrcpy(s->name, sizeof(s->name), vdi); 1653 qemu_co_mutex_init(&s->lock); 1654 qemu_co_mutex_init(&s->queue_lock); 1655 qemu_co_queue_init(&s->overlapping_queue); 1656 qemu_opts_del(opts); 1657 g_free(buf); 1658 return 0; 1659 1660err: 1661 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, 1662 false, NULL, NULL, NULL, NULL); 1663 closesocket(s->fd); 1664err_no_fd: 1665 qemu_opts_del(opts); 1666 g_free(buf); 1667 return ret; 1668} 1669 1670static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, 1671 Error **errp) 1672{ 1673 BDRVSheepdogState *s = state->bs->opaque; 1674 BDRVSheepdogReopenState *re_s; 1675 int ret = 0; 1676 1677 re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1); 1678 1679 re_s->cache_flags = SD_FLAG_CMD_CACHE; 1680 if (state->flags & 
BDRV_O_NOCACHE) { 1681 re_s->cache_flags = SD_FLAG_CMD_DIRECT; 1682 } 1683 1684 re_s->fd = get_sheep_fd(s, errp); 1685 if (re_s->fd < 0) { 1686 ret = re_s->fd; 1687 return ret; 1688 } 1689 1690 return ret; 1691} 1692 1693static void sd_reopen_commit(BDRVReopenState *state) 1694{ 1695 BDRVSheepdogReopenState *re_s = state->opaque; 1696 BDRVSheepdogState *s = state->bs->opaque; 1697 1698 if (s->fd) { 1699 aio_set_fd_handler(s->aio_context, s->fd, false, 1700 NULL, NULL, NULL, NULL); 1701 closesocket(s->fd); 1702 } 1703 1704 s->fd = re_s->fd; 1705 s->cache_flags = re_s->cache_flags; 1706 1707 g_free(state->opaque); 1708 state->opaque = NULL; 1709 1710 return; 1711} 1712 1713static void sd_reopen_abort(BDRVReopenState *state) 1714{ 1715 BDRVSheepdogReopenState *re_s = state->opaque; 1716 BDRVSheepdogState *s = state->bs->opaque; 1717 1718 if (re_s == NULL) { 1719 return; 1720 } 1721 1722 if (re_s->fd) { 1723 aio_set_fd_handler(s->aio_context, re_s->fd, false, 1724 NULL, NULL, NULL, NULL); 1725 closesocket(re_s->fd); 1726 } 1727 1728 g_free(state->opaque); 1729 state->opaque = NULL; 1730 1731 return; 1732} 1733 1734static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, 1735 Error **errp) 1736{ 1737 SheepdogVdiReq hdr; 1738 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; 1739 int fd, ret; 1740 unsigned int wlen, rlen = 0; 1741 char buf[SD_MAX_VDI_LEN]; 1742 1743 fd = connect_to_sdog(s, errp); 1744 if (fd < 0) { 1745 return fd; 1746 } 1747 1748 /* FIXME: would it be better to fail (e.g., return -EIO) when filename 1749 * does not fit in buf? For now, just truncate and avoid buffer overrun. 
1750 */ 1751 memset(buf, 0, sizeof(buf)); 1752 pstrcpy(buf, sizeof(buf), s->name); 1753 1754 memset(&hdr, 0, sizeof(hdr)); 1755 hdr.opcode = SD_OP_NEW_VDI; 1756 hdr.base_vdi_id = s->inode.vdi_id; 1757 1758 wlen = SD_MAX_VDI_LEN; 1759 1760 hdr.flags = SD_FLAG_CMD_WRITE; 1761 hdr.snapid = snapshot; 1762 1763 hdr.data_length = wlen; 1764 hdr.vdi_size = s->inode.vdi_size; 1765 hdr.copy_policy = s->inode.copy_policy; 1766 hdr.copies = s->inode.nr_copies; 1767 hdr.block_size_shift = s->inode.block_size_shift; 1768 1769 ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen); 1770 1771 closesocket(fd); 1772 1773 if (ret) { 1774 error_setg_errno(errp, -ret, "create failed"); 1775 return ret; 1776 } 1777 1778 if (rsp->result != SD_RES_SUCCESS) { 1779 error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name); 1780 return -EIO; 1781 } 1782 1783 if (vdi_id) { 1784 *vdi_id = rsp->vdi_id; 1785 } 1786 1787 return 0; 1788} 1789 1790static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size, 1791 Error **errp) 1792{ 1793 BlockBackend *blk = NULL; 1794 BDRVSheepdogState *base = bs->opaque; 1795 unsigned long buf_size; 1796 uint32_t idx, max_idx; 1797 uint32_t object_size; 1798 void *buf = NULL; 1799 int ret; 1800 1801 blk = blk_new_with_bs(bs, 1802 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE, 1803 BLK_PERM_ALL, errp); 1804 1805 if (!blk) { 1806 ret = -EPERM; 1807 goto out_with_err_set; 1808 } 1809 1810 blk_set_allow_write_beyond_eof(blk, true); 1811 1812 object_size = (UINT32_C(1) << base->inode.block_size_shift); 1813 buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); 1814 buf = g_malloc0(buf_size); 1815 1816 max_idx = DIV_ROUND_UP(new_size, buf_size); 1817 1818 for (idx = old_size / buf_size; idx < max_idx; idx++) { 1819 /* 1820 * The created image can be a cloned image, so we need to read 1821 * a data from the source image. 
1822 */ 1823 ret = blk_pread(blk, idx * buf_size, buf, buf_size); 1824 if (ret < 0) { 1825 goto out; 1826 } 1827 ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0); 1828 if (ret < 0) { 1829 goto out; 1830 } 1831 } 1832 1833 ret = 0; 1834out: 1835 if (ret < 0) { 1836 error_setg_errno(errp, -ret, "Can't pre-allocate"); 1837 } 1838out_with_err_set: 1839 blk_unref(blk); 1840 g_free(buf); 1841 1842 return ret; 1843} 1844 1845static int sd_create_prealloc(BlockdevOptionsSheepdog *location, int64_t size, 1846 Error **errp) 1847{ 1848 BlockDriverState *bs; 1849 Visitor *v; 1850 QObject *obj = NULL; 1851 QDict *qdict; 1852 int ret; 1853 1854 v = qobject_output_visitor_new(&obj); 1855 visit_type_BlockdevOptionsSheepdog(v, NULL, &location, &error_abort); 1856 visit_free(v); 1857 1858 qdict = qobject_to(QDict, obj); 1859 qdict_flatten(qdict); 1860 1861 qdict_put_str(qdict, "driver", "sheepdog"); 1862 1863 bs = bdrv_open(NULL, NULL, qdict, BDRV_O_PROTOCOL | BDRV_O_RDWR, errp); 1864 if (bs == NULL) { 1865 ret = -EIO; 1866 goto fail; 1867 } 1868 1869 ret = sd_prealloc(bs, 0, size, errp); 1870fail: 1871 bdrv_unref(bs); 1872 qobject_unref(qdict); 1873 return ret; 1874} 1875 1876static int parse_redundancy(BDRVSheepdogState *s, SheepdogRedundancy *opt) 1877{ 1878 struct SheepdogInode *inode = &s->inode; 1879 1880 switch (opt->type) { 1881 case SHEEPDOG_REDUNDANCY_TYPE_FULL: 1882 if (opt->u.full.copies > SD_MAX_COPIES || opt->u.full.copies < 1) { 1883 return -EINVAL; 1884 } 1885 inode->copy_policy = 0; 1886 inode->nr_copies = opt->u.full.copies; 1887 return 0; 1888 1889 case SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED: 1890 { 1891 int64_t copy = opt->u.erasure_coded.data_strips; 1892 int64_t parity = opt->u.erasure_coded.parity_strips; 1893 1894 if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { 1895 return -EINVAL; 1896 } 1897 1898 if (parity >= SD_EC_MAX_STRIP || parity < 1) { 1899 return -EINVAL; 1900 } 1901 1902 /* 1903 * 4 bits for parity and 4 bits for data. 
1904 * We have to compress upper data bits because it can't represent 16 1905 */ 1906 inode->copy_policy = ((copy / 2) << 4) + parity; 1907 inode->nr_copies = copy + parity; 1908 return 0; 1909 } 1910 1911 default: 1912 g_assert_not_reached(); 1913 } 1914 1915 return -EINVAL; 1916} 1917 1918/* 1919 * Sheepdog support two kinds of redundancy, full replication and erasure 1920 * coding. 1921 * 1922 * # create a fully replicated vdi with x copies 1923 * -o redundancy=x (1 <= x <= SD_MAX_COPIES) 1924 * 1925 * # create a erasure coded vdi with x data strips and y parity strips 1926 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) 1927 */ 1928static SheepdogRedundancy *parse_redundancy_str(const char *opt) 1929{ 1930 SheepdogRedundancy *redundancy; 1931 const char *n1, *n2; 1932 long copy, parity; 1933 char p[10]; 1934 int ret; 1935 1936 pstrcpy(p, sizeof(p), opt); 1937 n1 = strtok(p, ":"); 1938 n2 = strtok(NULL, ":"); 1939 1940 if (!n1) { 1941 return NULL; 1942 } 1943 1944 ret = qemu_strtol(n1, NULL, 10, &copy); 1945 if (ret < 0) { 1946 return NULL; 1947 } 1948 1949 redundancy = g_new0(SheepdogRedundancy, 1); 1950 if (!n2) { 1951 *redundancy = (SheepdogRedundancy) { 1952 .type = SHEEPDOG_REDUNDANCY_TYPE_FULL, 1953 .u.full.copies = copy, 1954 }; 1955 } else { 1956 ret = qemu_strtol(n2, NULL, 10, &parity); 1957 if (ret < 0) { 1958 g_free(redundancy); 1959 return NULL; 1960 } 1961 1962 *redundancy = (SheepdogRedundancy) { 1963 .type = SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED, 1964 .u.erasure_coded = { 1965 .data_strips = copy, 1966 .parity_strips = parity, 1967 }, 1968 }; 1969 } 1970 1971 return redundancy; 1972} 1973 1974static int parse_block_size_shift(BDRVSheepdogState *s, 1975 BlockdevCreateOptionsSheepdog *opts) 1976{ 1977 struct SheepdogInode *inode = &s->inode; 1978 uint64_t object_size; 1979 int obj_order; 1980 1981 if (opts->has_object_size) { 1982 object_size = opts->object_size; 1983 1984 if ((object_size - 1) & object_size) { /* not 
a power of 2? */ 1985 return -EINVAL; 1986 } 1987 obj_order = ctz32(object_size); 1988 if (obj_order < 20 || obj_order > 31) { 1989 return -EINVAL; 1990 } 1991 inode->block_size_shift = (uint8_t)obj_order; 1992 } 1993 1994 return 0; 1995} 1996 1997static int sd_co_create(BlockdevCreateOptions *options, Error **errp) 1998{ 1999 BlockdevCreateOptionsSheepdog *opts = &options->u.sheepdog; 2000 int ret = 0; 2001 uint32_t vid = 0; 2002 char *backing_file = NULL; 2003 char *buf = NULL; 2004 BDRVSheepdogState *s; 2005 uint64_t max_vdi_size; 2006 bool prealloc = false; 2007 2008 assert(options->driver == BLOCKDEV_DRIVER_SHEEPDOG); 2009 2010 s = g_new0(BDRVSheepdogState, 1); 2011 2012 /* Steal SocketAddress from QAPI, set NULL to prevent double free */ 2013 s->addr = opts->location->server; 2014 opts->location->server = NULL; 2015 2016 if (strlen(opts->location->vdi) >= sizeof(s->name)) { 2017 error_setg(errp, "'vdi' string too long"); 2018 ret = -EINVAL; 2019 goto out; 2020 } 2021 pstrcpy(s->name, sizeof(s->name), opts->location->vdi); 2022 2023 s->inode.vdi_size = opts->size; 2024 backing_file = opts->backing_file; 2025 2026 if (!opts->has_preallocation) { 2027 opts->preallocation = PREALLOC_MODE_OFF; 2028 } 2029 switch (opts->preallocation) { 2030 case PREALLOC_MODE_OFF: 2031 prealloc = false; 2032 break; 2033 case PREALLOC_MODE_FULL: 2034 prealloc = true; 2035 break; 2036 default: 2037 error_setg(errp, "Preallocation mode not supported for Sheepdog"); 2038 ret = -EINVAL; 2039 goto out; 2040 } 2041 2042 if (opts->has_redundancy) { 2043 ret = parse_redundancy(s, opts->redundancy); 2044 if (ret < 0) { 2045 error_setg(errp, "Invalid redundancy mode"); 2046 goto out; 2047 } 2048 } 2049 ret = parse_block_size_shift(s, opts); 2050 if (ret < 0) { 2051 error_setg(errp, "Invalid object_size." 
2052 " obect_size needs to be power of 2" 2053 " and be limited from 2^20 to 2^31"); 2054 goto out; 2055 } 2056 2057 if (opts->has_backing_file) { 2058 BlockBackend *blk; 2059 BDRVSheepdogState *base; 2060 BlockDriver *drv; 2061 2062 /* Currently, only Sheepdog backing image is supported. */ 2063 drv = bdrv_find_protocol(opts->backing_file, true, NULL); 2064 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { 2065 error_setg(errp, "backing_file must be a sheepdog image"); 2066 ret = -EINVAL; 2067 goto out; 2068 } 2069 2070 blk = blk_new_open(opts->backing_file, NULL, NULL, 2071 BDRV_O_PROTOCOL, errp); 2072 if (blk == NULL) { 2073 ret = -EIO; 2074 goto out; 2075 } 2076 2077 base = blk_bs(blk)->opaque; 2078 2079 if (!is_snapshot(&base->inode)) { 2080 error_setg(errp, "cannot clone from a non snapshot vdi"); 2081 blk_unref(blk); 2082 ret = -EINVAL; 2083 goto out; 2084 } 2085 s->inode.vdi_id = base->inode.vdi_id; 2086 blk_unref(blk); 2087 } 2088 2089 s->aio_context = qemu_get_aio_context(); 2090 2091 /* if block_size_shift is not specified, get cluster default value */ 2092 if (s->inode.block_size_shift == 0) { 2093 SheepdogVdiReq hdr; 2094 SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; 2095 int fd; 2096 unsigned int wlen = 0, rlen = 0; 2097 2098 fd = connect_to_sdog(s, errp); 2099 if (fd < 0) { 2100 ret = fd; 2101 goto out; 2102 } 2103 2104 memset(&hdr, 0, sizeof(hdr)); 2105 hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; 2106 hdr.proto_ver = SD_PROTO_VER; 2107 2108 ret = do_req(fd, NULL, (SheepdogReq *)&hdr, 2109 NULL, &wlen, &rlen); 2110 closesocket(fd); 2111 if (ret) { 2112 error_setg_errno(errp, -ret, "failed to get cluster default"); 2113 goto out; 2114 } 2115 if (rsp->result == SD_RES_SUCCESS) { 2116 s->inode.block_size_shift = rsp->block_size_shift; 2117 } else { 2118 s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; 2119 } 2120 } 2121 2122 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; 2123 2124 if (s->inode.vdi_size > 
max_vdi_size) { 2125 error_setg(errp, "An image is too large." 2126 " The maximum image size is %"PRIu64 "GB", 2127 max_vdi_size / 1024 / 1024 / 1024); 2128 ret = -EINVAL; 2129 goto out; 2130 } 2131 2132 ret = do_sd_create(s, &vid, 0, errp); 2133 if (ret) { 2134 goto out; 2135 } 2136 2137 if (prealloc) { 2138 ret = sd_create_prealloc(opts->location, opts->size, errp); 2139 } 2140out: 2141 g_free(backing_file); 2142 g_free(buf); 2143 g_free(s->addr); 2144 g_free(s); 2145 return ret; 2146} 2147 2148static int coroutine_fn sd_co_create_opts(BlockDriver *drv, 2149 const char *filename, 2150 QemuOpts *opts, 2151 Error **errp) 2152{ 2153 BlockdevCreateOptions *create_options = NULL; 2154 QDict *qdict, *location_qdict; 2155 Visitor *v; 2156 char *redundancy; 2157 Error *local_err = NULL; 2158 int ret; 2159 2160 redundancy = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY); 2161 2162 qdict = qemu_opts_to_qdict(opts, NULL); 2163 qdict_put_str(qdict, "driver", "sheepdog"); 2164 2165 location_qdict = qdict_new(); 2166 qdict_put(qdict, "location", location_qdict); 2167 2168 sd_parse_filename(filename, location_qdict, &local_err); 2169 if (local_err) { 2170 error_propagate(errp, local_err); 2171 ret = -EINVAL; 2172 goto fail; 2173 } 2174 2175 qdict_flatten(qdict); 2176 2177 /* Change legacy command line options into QMP ones */ 2178 static const QDictRenames opt_renames[] = { 2179 { BLOCK_OPT_BACKING_FILE, "backing-file" }, 2180 { BLOCK_OPT_OBJECT_SIZE, "object-size" }, 2181 { NULL, NULL }, 2182 }; 2183 2184 if (!qdict_rename_keys(qdict, opt_renames, errp)) { 2185 ret = -EINVAL; 2186 goto fail; 2187 } 2188 2189 /* Get the QAPI object */ 2190 v = qobject_input_visitor_new_flat_confused(qdict, errp); 2191 if (!v) { 2192 ret = -EINVAL; 2193 goto fail; 2194 } 2195 2196 visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp); 2197 visit_free(v); 2198 if (!create_options) { 2199 ret = -EINVAL; 2200 goto fail; 2201 } 2202 2203 assert(create_options->driver == 
BLOCKDEV_DRIVER_SHEEPDOG); 2204 create_options->u.sheepdog.size = 2205 ROUND_UP(create_options->u.sheepdog.size, BDRV_SECTOR_SIZE); 2206 2207 if (redundancy) { 2208 create_options->u.sheepdog.has_redundancy = true; 2209 create_options->u.sheepdog.redundancy = 2210 parse_redundancy_str(redundancy); 2211 if (create_options->u.sheepdog.redundancy == NULL) { 2212 error_setg(errp, "Invalid redundancy mode"); 2213 ret = -EINVAL; 2214 goto fail; 2215 } 2216 } 2217 2218 ret = sd_co_create(create_options, errp); 2219fail: 2220 qapi_free_BlockdevCreateOptions(create_options); 2221 qobject_unref(qdict); 2222 g_free(redundancy); 2223 return ret; 2224} 2225 2226static void sd_close(BlockDriverState *bs) 2227{ 2228 Error *local_err = NULL; 2229 BDRVSheepdogState *s = bs->opaque; 2230 SheepdogVdiReq hdr; 2231 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; 2232 unsigned int wlen, rlen = 0; 2233 int fd, ret; 2234 2235 trace_sheepdog_close(s->name); 2236 2237 fd = connect_to_sdog(s, &local_err); 2238 if (fd < 0) { 2239 error_report_err(local_err); 2240 return; 2241 } 2242 2243 memset(&hdr, 0, sizeof(hdr)); 2244 2245 hdr.opcode = SD_OP_RELEASE_VDI; 2246 hdr.type = LOCK_TYPE_NORMAL; 2247 hdr.base_vdi_id = s->inode.vdi_id; 2248 wlen = strlen(s->name) + 1; 2249 hdr.data_length = wlen; 2250 hdr.flags = SD_FLAG_CMD_WRITE; 2251 2252 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, 2253 s->name, &wlen, &rlen); 2254 2255 closesocket(fd); 2256 2257 if (!ret && rsp->result != SD_RES_SUCCESS && 2258 rsp->result != SD_RES_VDI_NOT_LOCKED) { 2259 error_report("%s, %s", sd_strerror(rsp->result), s->name); 2260 } 2261 2262 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, 2263 false, NULL, NULL, NULL, NULL); 2264 closesocket(s->fd); 2265 qapi_free_SocketAddress(s->addr); 2266} 2267 2268static int64_t sd_getlength(BlockDriverState *bs) 2269{ 2270 BDRVSheepdogState *s = bs->opaque; 2271 2272 return s->inode.vdi_size; 2273} 2274 2275static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t 
offset, 2276 bool exact, PreallocMode prealloc, 2277 BdrvRequestFlags flags, Error **errp) 2278{ 2279 BDRVSheepdogState *s = bs->opaque; 2280 int ret, fd; 2281 unsigned int datalen; 2282 uint64_t max_vdi_size; 2283 int64_t old_size = s->inode.vdi_size; 2284 2285 if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_FULL) { 2286 error_setg(errp, "Unsupported preallocation mode '%s'", 2287 PreallocMode_str(prealloc)); 2288 return -ENOTSUP; 2289 } 2290 2291 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; 2292 if (offset < old_size) { 2293 error_setg(errp, "shrinking is not supported"); 2294 return -EINVAL; 2295 } else if (offset > max_vdi_size) { 2296 error_setg(errp, "too big image size"); 2297 return -EINVAL; 2298 } 2299 2300 fd = connect_to_sdog(s, errp); 2301 if (fd < 0) { 2302 return fd; 2303 } 2304 2305 /* we don't need to update entire object */ 2306 datalen = SD_INODE_HEADER_SIZE; 2307 s->inode.vdi_size = offset; 2308 ret = write_object(fd, s->bs, (char *)&s->inode, 2309 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, 2310 datalen, 0, false, s->cache_flags); 2311 close(fd); 2312 2313 if (ret < 0) { 2314 error_setg_errno(errp, -ret, "failed to update an inode"); 2315 return ret; 2316 } 2317 2318 if (prealloc == PREALLOC_MODE_FULL) { 2319 ret = sd_prealloc(bs, old_size, offset, errp); 2320 if (ret < 0) { 2321 return ret; 2322 } 2323 } 2324 2325 return 0; 2326} 2327 2328/* 2329 * This function is called after writing data objects. If we need to 2330 * update metadata, this sends a write request to the vdi object. 2331 */ 2332static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) 2333{ 2334 BDRVSheepdogState *s = acb->s; 2335 struct iovec iov; 2336 AIOReq *aio_req; 2337 uint32_t offset, data_len, mn, mx; 2338 2339 mn = acb->min_dirty_data_idx; 2340 mx = acb->max_dirty_data_idx; 2341 if (mn <= mx) { 2342 /* we need to update the vdi object. 
*/ 2343 ++acb->nr_pending; 2344 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + 2345 mn * sizeof(s->inode.data_vdi_id[0]); 2346 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); 2347 2348 acb->min_dirty_data_idx = UINT32_MAX; 2349 acb->max_dirty_data_idx = 0; 2350 2351 iov.iov_base = &s->inode; 2352 iov.iov_len = sizeof(s->inode); 2353 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), 2354 data_len, offset, 0, false, 0, offset); 2355 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); 2356 if (--acb->nr_pending) { 2357 qemu_coroutine_yield(); 2358 } 2359 } 2360} 2361 2362/* Delete current working VDI on the snapshot chain */ 2363static bool sd_delete(BDRVSheepdogState *s) 2364{ 2365 Error *local_err = NULL; 2366 unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; 2367 SheepdogVdiReq hdr = { 2368 .opcode = SD_OP_DEL_VDI, 2369 .base_vdi_id = s->inode.vdi_id, 2370 .data_length = wlen, 2371 .flags = SD_FLAG_CMD_WRITE, 2372 }; 2373 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; 2374 int fd, ret; 2375 2376 fd = connect_to_sdog(s, &local_err); 2377 if (fd < 0) { 2378 error_report_err(local_err); 2379 return false; 2380 } 2381 2382 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, 2383 s->name, &wlen, &rlen); 2384 closesocket(fd); 2385 if (ret) { 2386 return false; 2387 } 2388 switch (rsp->result) { 2389 case SD_RES_NO_VDI: 2390 error_report("%s was already deleted", s->name); 2391 /* fall through */ 2392 case SD_RES_SUCCESS: 2393 break; 2394 default: 2395 error_report("%s, %s", sd_strerror(rsp->result), s->name); 2396 return false; 2397 } 2398 2399 return true; 2400} 2401 2402/* 2403 * Create a writable VDI from a snapshot 2404 */ 2405static int sd_create_branch(BDRVSheepdogState *s) 2406{ 2407 Error *local_err = NULL; 2408 int ret, fd; 2409 uint32_t vid; 2410 char *buf; 2411 bool deleted; 2412 2413 trace_sheepdog_create_branch_snapshot(s->inode.vdi_id); 2414 2415 buf = g_malloc(SD_INODE_SIZE); 2416 2417 /* 2418 * Even If deletion fails, we 
/*
 * Send I/O requests to the server.
 *
 * Splits the sector range described by @acb into per-object requests
 * and queues each of them with add_aio_request().  Unallocated objects
 * are not requested at all: reads of them are satisfied by zero-filling
 * the caller's buffer, and discards skip them (as well as objects that
 * are only partially covered).
 *
 * The first write to a VDI opened as a snapshot creates a writable
 * branch via sd_create_branch() before any request is sent.
 *
 * This function returns nothing.  If creating the branch fails,
 * acb->ret is set to -EIO and no request is sent.  Otherwise, if
 * requests are still pending once all of them have been queued, the
 * coroutine yields.
 * NOTE(review): presumably the response handler re-enters it when the
 * last request completes - confirm against the fd handler.
 */
static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
{
    int ret = 0;
    unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
    unsigned long idx;
    uint32_t object_size;
    uint64_t oid;
    uint64_t offset;
    BDRVSheepdogState *s = acb->s;
    SheepdogInode *inode = &s->inode;
    AIOReq *aio_req;

    if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
        /*
         * In the case we open the snapshot VDI, Sheepdog creates the
         * writable VDI when we do a write operation first.
         */
        ret = sd_create_branch(s);
        if (ret) {
            acb->ret = -EIO;
            return;
        }
    }

    /* Index of the first object touched and the byte offset inside it. */
    object_size = (UINT32_C(1) << inode->block_size_shift);
    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;

    /*
     * Make sure we don't free the aiocb before we are done with all requests.
     * This additional reference is dropped at the end of this function.
     */
    acb->nr_pending++;

    while (done != total) {
        uint8_t flags = 0;
        uint64_t old_oid = 0;
        bool create = false;

        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);

        /* Never cross an object boundary within a single request. */
        len = MIN(total - done, object_size - offset);

        switch (acb->aiocb_type) {
        case AIOCB_READ_UDATA:
            /* An unallocated object reads as zeroes; no request needed. */
            if (!inode->data_vdi_id[idx]) {
                qemu_iovec_memset(acb->qiov, done, 0, len);
                goto done;
            }
            break;
        case AIOCB_WRITE_UDATA:
            if (!inode->data_vdi_id[idx]) {
                create = true;
            } else if (!is_data_obj_writable(inode, idx)) {
                /* Copy-On-Write */
                create = true;
                old_oid = oid;
                flags = SD_FLAG_CMD_COW;
            }
            break;
        case AIOCB_DISCARD_OBJ:
            /*
             * We discard the object only when the whole object is
             * 1) allocated 2) trimmed. Otherwise, simply skip it.
             */
            if (len != object_size || inode->data_vdi_id[idx] == 0) {
                goto done;
            }
            break;
        default:
            break;
        }

        if (create) {
            trace_sheepdog_co_rw_vector_update(inode->vdi_id, oid,
                                  vid_to_data_oid(inode->data_vdi_id[idx], idx),
                                  idx);
            /* A created object is addressed through this VDI's own id. */
            oid = vid_to_data_oid(inode->vdi_id, idx);
            trace_sheepdog_co_rw_vector_new(oid);
        }

        /* The last argument is the request's offset into the iovec. */
        aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
                                old_oid,
                                acb->aiocb_type == AIOCB_DISCARD_OBJ ?
                                0 : done);
        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
                        acb->aiocb_type);
    done:
        /* Only the first object can start at a non-zero offset. */
        offset = 0;
        idx++;
        done += len;
    }
    /* Drop the extra reference taken above; wait if requests remain. */
    if (--acb->nr_pending) {
        qemu_coroutine_yield();
    }
}
aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id), 2634 0, 0, 0, false, 0, 0); 2635 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type); 2636 2637 if (--acb.nr_pending) { 2638 qemu_coroutine_yield(); 2639 } 2640 2641 sd_aio_complete(&acb); 2642 return acb.ret; 2643} 2644 2645static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) 2646{ 2647 Error *local_err = NULL; 2648 BDRVSheepdogState *s = bs->opaque; 2649 int ret, fd; 2650 uint32_t new_vid; 2651 SheepdogInode *inode; 2652 unsigned int datalen; 2653 2654 trace_sheepdog_snapshot_create_info(sn_info->name, sn_info->id_str, s->name, 2655 sn_info->vm_state_size, s->is_snapshot); 2656 2657 if (s->is_snapshot) { 2658 error_report("You can't create a snapshot of a snapshot VDI, " 2659 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id); 2660 2661 return -EINVAL; 2662 } 2663 2664 trace_sheepdog_snapshot_create(sn_info->name, sn_info->id_str); 2665 2666 s->inode.vm_state_size = sn_info->vm_state_size; 2667 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; 2668 /* It appears that inode.tag does not require a NUL terminator, 2669 * which means this use of strncpy is ok. 2670 */ 2671 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag)); 2672 /* we don't need to update entire object */ 2673 datalen = SD_INODE_HEADER_SIZE; 2674 inode = g_malloc(datalen); 2675 2676 /* refresh inode. 
*/ 2677 fd = connect_to_sdog(s, &local_err); 2678 if (fd < 0) { 2679 error_report_err(local_err); 2680 ret = fd; 2681 goto cleanup; 2682 } 2683 2684 ret = write_object(fd, s->bs, (char *)&s->inode, 2685 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, 2686 datalen, 0, false, s->cache_flags); 2687 if (ret < 0) { 2688 error_report("failed to write snapshot's inode."); 2689 goto cleanup; 2690 } 2691 2692 ret = do_sd_create(s, &new_vid, 1, &local_err); 2693 if (ret < 0) { 2694 error_reportf_err(local_err, 2695 "failed to create inode for snapshot: "); 2696 goto cleanup; 2697 } 2698 2699 ret = read_object(fd, s->bs, (char *)inode, 2700 vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0, 2701 s->cache_flags); 2702 2703 if (ret < 0) { 2704 error_report("failed to read new inode info. %s", strerror(errno)); 2705 goto cleanup; 2706 } 2707 2708 memcpy(&s->inode, inode, datalen); 2709 trace_sheepdog_snapshot_create_inode(s->inode.name, s->inode.snap_id, 2710 s->inode.vdi_id); 2711 2712cleanup: 2713 g_free(inode); 2714 closesocket(fd); 2715 return ret; 2716} 2717 2718/* 2719 * We implement rollback(loadvm) operation to the specified snapshot by 2720 * 1) switch to the snapshot 2721 * 2) rely on sd_create_branch to delete working VDI and 2722 * 3) create a new working VDI based on the specified snapshot 2723 */ 2724static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) 2725{ 2726 BDRVSheepdogState *s = bs->opaque; 2727 BDRVSheepdogState *old_s; 2728 char tag[SD_MAX_VDI_TAG_LEN]; 2729 uint32_t snapid = 0; 2730 int ret; 2731 2732 if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) { 2733 return -EINVAL; 2734 } 2735 2736 old_s = g_new(BDRVSheepdogState, 1); 2737 2738 memcpy(old_s, s, sizeof(BDRVSheepdogState)); 2739 2740 ret = reload_inode(s, snapid, tag); 2741 if (ret) { 2742 goto out; 2743 } 2744 2745 ret = sd_create_branch(s); 2746 if (ret) { 2747 goto out; 2748 } 2749 2750 g_free(old_s); 2751 2752 return 0; 2753out: 2754 /* recover 
bdrv_sd_state */ 2755 memcpy(s, old_s, sizeof(BDRVSheepdogState)); 2756 g_free(old_s); 2757 2758 error_report("failed to open. recover old bdrv_sd_state."); 2759 2760 return ret; 2761} 2762 2763#define NR_BATCHED_DISCARD 128 2764 2765static int remove_objects(BDRVSheepdogState *s, Error **errp) 2766{ 2767 int fd, i = 0, nr_objs = 0; 2768 int ret; 2769 SheepdogInode *inode = &s->inode; 2770 2771 fd = connect_to_sdog(s, errp); 2772 if (fd < 0) { 2773 return fd; 2774 } 2775 2776 nr_objs = count_data_objs(inode); 2777 while (i < nr_objs) { 2778 int start_idx, nr_filled_idx; 2779 2780 while (i < nr_objs && !inode->data_vdi_id[i]) { 2781 i++; 2782 } 2783 start_idx = i; 2784 2785 nr_filled_idx = 0; 2786 while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) { 2787 if (inode->data_vdi_id[i]) { 2788 inode->data_vdi_id[i] = 0; 2789 nr_filled_idx++; 2790 } 2791 2792 i++; 2793 } 2794 2795 ret = write_object(fd, s->bs, 2796 (char *)&inode->data_vdi_id[start_idx], 2797 vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies, 2798 (i - start_idx) * sizeof(uint32_t), 2799 offsetof(struct SheepdogInode, 2800 data_vdi_id[start_idx]), 2801 false, s->cache_flags); 2802 if (ret < 0) { 2803 error_setg(errp, "Failed to discard snapshot inode"); 2804 goto out; 2805 } 2806 } 2807 2808 ret = 0; 2809out: 2810 closesocket(fd); 2811 return ret; 2812} 2813 2814static int sd_snapshot_delete(BlockDriverState *bs, 2815 const char *snapshot_id, 2816 const char *name, 2817 Error **errp) 2818{ 2819 /* 2820 * FIXME should delete the snapshot matching both @snapshot_id and 2821 * @name, but @name not used here 2822 */ 2823 unsigned long snap_id = 0; 2824 char snap_tag[SD_MAX_VDI_TAG_LEN]; 2825 int fd, ret; 2826 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; 2827 BDRVSheepdogState *s = bs->opaque; 2828 unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0; 2829 uint32_t vid; 2830 SheepdogVdiReq hdr = { 2831 .opcode = SD_OP_DEL_VDI, 2832 .data_length = wlen, 2833 .flags = SD_FLAG_CMD_WRITE, 2834 
}; 2835 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; 2836 2837 ret = remove_objects(s, errp); 2838 if (ret) { 2839 return ret; 2840 } 2841 2842 memset(buf, 0, sizeof(buf)); 2843 memset(snap_tag, 0, sizeof(snap_tag)); 2844 pstrcpy(buf, SD_MAX_VDI_LEN, s->name); 2845 /* TODO Use sd_parse_snapid() once this mess is cleaned up */ 2846 ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id); 2847 if (ret || snap_id > UINT32_MAX) { 2848 /* 2849 * FIXME Since qemu_strtoul() returns -EINVAL when 2850 * @snapshot_id is null, @snapshot_id is mandatory. Correct 2851 * would be to require at least one of @snapshot_id and @name. 2852 */ 2853 error_setg(errp, "Invalid snapshot ID: %s", 2854 snapshot_id ? snapshot_id : "<null>"); 2855 return -EINVAL; 2856 } 2857 2858 if (snap_id) { 2859 hdr.snapid = (uint32_t) snap_id; 2860 } else { 2861 /* FIXME I suspect we should use @name here */ 2862 /* FIXME don't truncate silently */ 2863 pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id); 2864 pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); 2865 } 2866 2867 ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp); 2868 if (ret) { 2869 return ret; 2870 } 2871 2872 fd = connect_to_sdog(s, errp); 2873 if (fd < 0) { 2874 return fd; 2875 } 2876 2877 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, 2878 buf, &wlen, &rlen); 2879 closesocket(fd); 2880 if (ret) { 2881 error_setg_errno(errp, -ret, "Couldn't send request to server"); 2882 return ret; 2883 } 2884 2885 switch (rsp->result) { 2886 case SD_RES_NO_VDI: 2887 error_setg(errp, "Can't find the snapshot"); 2888 return -ENOENT; 2889 case SD_RES_SUCCESS: 2890 break; 2891 default: 2892 error_setg(errp, "%s", sd_strerror(rsp->result)); 2893 return -EIO; 2894 } 2895 2896 return 0; 2897} 2898 2899static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) 2900{ 2901 Error *local_err = NULL; 2902 BDRVSheepdogState *s = bs->opaque; 2903 SheepdogReq req; 2904 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * 
sizeof(long); 2905 QEMUSnapshotInfo *sn_tab = NULL; 2906 unsigned wlen, rlen; 2907 int found = 0; 2908 SheepdogInode *inode; 2909 unsigned long *vdi_inuse; 2910 unsigned int start_nr; 2911 uint64_t hval; 2912 uint32_t vid; 2913 2914 vdi_inuse = g_malloc(max); 2915 inode = g_malloc(SD_INODE_HEADER_SIZE); 2916 2917 fd = connect_to_sdog(s, &local_err); 2918 if (fd < 0) { 2919 error_report_err(local_err); 2920 ret = fd; 2921 goto out; 2922 } 2923 2924 rlen = max; 2925 wlen = 0; 2926 2927 memset(&req, 0, sizeof(req)); 2928 2929 req.opcode = SD_OP_READ_VDIS; 2930 req.data_length = max; 2931 2932 ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen); 2933 2934 closesocket(fd); 2935 if (ret) { 2936 goto out; 2937 } 2938 2939 sn_tab = g_new0(QEMUSnapshotInfo, nr); 2940 2941 /* calculate a vdi id with hash function */ 2942 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); 2943 start_nr = hval & (SD_NR_VDIS - 1); 2944 2945 fd = connect_to_sdog(s, &local_err); 2946 if (fd < 0) { 2947 error_report_err(local_err); 2948 ret = fd; 2949 goto out; 2950 } 2951 2952 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) { 2953 if (!test_bit(vid, vdi_inuse)) { 2954 break; 2955 } 2956 2957 /* we don't need to read entire object */ 2958 ret = read_object(fd, s->bs, (char *)inode, 2959 vid_to_vdi_oid(vid), 2960 0, SD_INODE_HEADER_SIZE, 0, 2961 s->cache_flags); 2962 2963 if (ret) { 2964 continue; 2965 } 2966 2967 if (!strcmp(inode->name, s->name) && is_snapshot(inode)) { 2968 sn_tab[found].date_sec = inode->snap_ctime >> 32; 2969 sn_tab[found].date_nsec = inode->snap_ctime & 0xffffffff; 2970 sn_tab[found].vm_state_size = inode->vm_state_size; 2971 sn_tab[found].vm_clock_nsec = inode->vm_clock_nsec; 2972 2973 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), 2974 "%" PRIu32, inode->snap_id); 2975 pstrcpy(sn_tab[found].name, 2976 MIN(sizeof(sn_tab[found].name), sizeof(inode->tag)), 2977 inode->tag); 2978 found++; 2979 } 2980 } 2981 2982 closesocket(fd); 
2983out: 2984 *psn_tab = sn_tab; 2985 2986 g_free(vdi_inuse); 2987 g_free(inode); 2988 2989 if (ret < 0) { 2990 return ret; 2991 } 2992 2993 return found; 2994} 2995 2996static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, 2997 int64_t pos, int size, int load) 2998{ 2999 Error *local_err = NULL; 3000 bool create; 3001 int fd, ret = 0, remaining = size; 3002 unsigned int data_len; 3003 uint64_t vmstate_oid; 3004 uint64_t offset; 3005 uint32_t vdi_index; 3006 uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id; 3007 uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); 3008 3009 fd = connect_to_sdog(s, &local_err); 3010 if (fd < 0) { 3011 error_report_err(local_err); 3012 return fd; 3013 } 3014 3015 while (remaining) { 3016 vdi_index = pos / object_size; 3017 offset = pos % object_size; 3018 3019 data_len = MIN(remaining, object_size - offset); 3020 3021 vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); 3022 3023 create = (offset == 0); 3024 if (load) { 3025 ret = read_object(fd, s->bs, (char *)data, vmstate_oid, 3026 s->inode.nr_copies, data_len, offset, 3027 s->cache_flags); 3028 } else { 3029 ret = write_object(fd, s->bs, (char *)data, vmstate_oid, 3030 s->inode.nr_copies, data_len, offset, create, 3031 s->cache_flags); 3032 } 3033 3034 if (ret < 0) { 3035 error_report("failed to save vmstate %s", strerror(errno)); 3036 goto cleanup; 3037 } 3038 3039 pos += data_len; 3040 data += data_len; 3041 remaining -= data_len; 3042 } 3043 ret = size; 3044cleanup: 3045 closesocket(fd); 3046 return ret; 3047} 3048 3049static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 3050 int64_t pos) 3051{ 3052 BDRVSheepdogState *s = bs->opaque; 3053 void *buf; 3054 int ret; 3055 3056 buf = qemu_blockalign(bs, qiov->size); 3057 qemu_iovec_to_buf(qiov, 0, buf, qiov->size); 3058 ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0); 3059 qemu_vfree(buf); 3060 3061 return ret; 3062} 3063 3064static int 
sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 3065 int64_t pos) 3066{ 3067 BDRVSheepdogState *s = bs->opaque; 3068 void *buf; 3069 int ret; 3070 3071 buf = qemu_blockalign(bs, qiov->size); 3072 ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1); 3073 qemu_iovec_from_buf(qiov, 0, buf, qiov->size); 3074 qemu_vfree(buf); 3075 3076 return ret; 3077} 3078 3079 3080static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset, 3081 int bytes) 3082{ 3083 SheepdogAIOCB acb; 3084 BDRVSheepdogState *s = bs->opaque; 3085 QEMUIOVector discard_iov; 3086 struct iovec iov; 3087 uint32_t zero = 0; 3088 3089 if (!s->discard_supported) { 3090 return 0; 3091 } 3092 3093 memset(&discard_iov, 0, sizeof(discard_iov)); 3094 memset(&iov, 0, sizeof(iov)); 3095 iov.iov_base = &zero; 3096 iov.iov_len = sizeof(zero); 3097 discard_iov.iov = &iov; 3098 discard_iov.niov = 1; 3099 if (!QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE)) { 3100 return -ENOTSUP; 3101 } 3102 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS, 3103 bytes >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ); 3104 sd_co_rw_vector(&acb); 3105 sd_aio_complete(&acb); 3106 3107 return acb.ret; 3108} 3109 3110static coroutine_fn int 3111sd_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, 3112 int64_t bytes, int64_t *pnum, int64_t *map, 3113 BlockDriverState **file) 3114{ 3115 BDRVSheepdogState *s = bs->opaque; 3116 SheepdogInode *inode = &s->inode; 3117 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); 3118 unsigned long start = offset / object_size, 3119 end = DIV_ROUND_UP(offset + bytes, object_size); 3120 unsigned long idx; 3121 *map = offset; 3122 int ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 3123 3124 for (idx = start; idx < end; idx++) { 3125 if (inode->data_vdi_id[idx] == 0) { 3126 break; 3127 } 3128 } 3129 if (idx == start) { 3130 /* Get the longest length of unallocated sectors */ 3131 ret = 0; 3132 for (idx = start + 1; idx < end; idx++) { 3133 
if (inode->data_vdi_id[idx] != 0) { 3134 break; 3135 } 3136 } 3137 } 3138 3139 *pnum = (idx - start) * object_size; 3140 if (*pnum > bytes) { 3141 *pnum = bytes; 3142 } 3143 if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { 3144 *file = bs; 3145 } 3146 return ret; 3147} 3148 3149static int64_t sd_get_allocated_file_size(BlockDriverState *bs) 3150{ 3151 BDRVSheepdogState *s = bs->opaque; 3152 SheepdogInode *inode = &s->inode; 3153 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); 3154 unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); 3155 uint64_t size = 0; 3156 3157 for (i = 0; i < last; i++) { 3158 if (inode->data_vdi_id[i] == 0) { 3159 continue; 3160 } 3161 size += object_size; 3162 } 3163 return size; 3164} 3165 3166static QemuOptsList sd_create_opts = { 3167 .name = "sheepdog-create-opts", 3168 .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head), 3169 .desc = { 3170 { 3171 .name = BLOCK_OPT_SIZE, 3172 .type = QEMU_OPT_SIZE, 3173 .help = "Virtual disk size" 3174 }, 3175 { 3176 .name = BLOCK_OPT_BACKING_FILE, 3177 .type = QEMU_OPT_STRING, 3178 .help = "File name of a base image" 3179 }, 3180 { 3181 .name = BLOCK_OPT_PREALLOC, 3182 .type = QEMU_OPT_STRING, 3183 .help = "Preallocation mode (allowed values: off, full)" 3184 }, 3185 { 3186 .name = BLOCK_OPT_REDUNDANCY, 3187 .type = QEMU_OPT_STRING, 3188 .help = "Redundancy of the image" 3189 }, 3190 { 3191 .name = BLOCK_OPT_OBJECT_SIZE, 3192 .type = QEMU_OPT_SIZE, 3193 .help = "Object size of the image" 3194 }, 3195 { /* end of list */ } 3196 } 3197}; 3198 3199static const char *const sd_strong_runtime_opts[] = { 3200 "vdi", 3201 "snap-id", 3202 "tag", 3203 "server.", 3204 3205 NULL 3206}; 3207 3208static BlockDriver bdrv_sheepdog = { 3209 .format_name = "sheepdog", 3210 .protocol_name = "sheepdog", 3211 .instance_size = sizeof(BDRVSheepdogState), 3212 .bdrv_parse_filename = sd_parse_filename, 3213 .bdrv_file_open = sd_open, 3214 .bdrv_reopen_prepare = sd_reopen_prepare, 3215 
.bdrv_reopen_commit = sd_reopen_commit, 3216 .bdrv_reopen_abort = sd_reopen_abort, 3217 .bdrv_close = sd_close, 3218 .bdrv_co_create = sd_co_create, 3219 .bdrv_co_create_opts = sd_co_create_opts, 3220 .bdrv_has_zero_init = bdrv_has_zero_init_1, 3221 .bdrv_getlength = sd_getlength, 3222 .bdrv_get_allocated_file_size = sd_get_allocated_file_size, 3223 .bdrv_co_truncate = sd_co_truncate, 3224 3225 .bdrv_co_readv = sd_co_readv, 3226 .bdrv_co_writev = sd_co_writev, 3227 .bdrv_co_flush_to_disk = sd_co_flush_to_disk, 3228 .bdrv_co_pdiscard = sd_co_pdiscard, 3229 .bdrv_co_block_status = sd_co_block_status, 3230 3231 .bdrv_snapshot_create = sd_snapshot_create, 3232 .bdrv_snapshot_goto = sd_snapshot_goto, 3233 .bdrv_snapshot_delete = sd_snapshot_delete, 3234 .bdrv_snapshot_list = sd_snapshot_list, 3235 3236 .bdrv_save_vmstate = sd_save_vmstate, 3237 .bdrv_load_vmstate = sd_load_vmstate, 3238 3239 .bdrv_detach_aio_context = sd_detach_aio_context, 3240 .bdrv_attach_aio_context = sd_attach_aio_context, 3241 3242 .create_opts = &sd_create_opts, 3243 .strong_runtime_opts = sd_strong_runtime_opts, 3244}; 3245 3246static BlockDriver bdrv_sheepdog_tcp = { 3247 .format_name = "sheepdog", 3248 .protocol_name = "sheepdog+tcp", 3249 .instance_size = sizeof(BDRVSheepdogState), 3250 .bdrv_parse_filename = sd_parse_filename, 3251 .bdrv_file_open = sd_open, 3252 .bdrv_reopen_prepare = sd_reopen_prepare, 3253 .bdrv_reopen_commit = sd_reopen_commit, 3254 .bdrv_reopen_abort = sd_reopen_abort, 3255 .bdrv_close = sd_close, 3256 .bdrv_co_create = sd_co_create, 3257 .bdrv_co_create_opts = sd_co_create_opts, 3258 .bdrv_has_zero_init = bdrv_has_zero_init_1, 3259 .bdrv_getlength = sd_getlength, 3260 .bdrv_get_allocated_file_size = sd_get_allocated_file_size, 3261 .bdrv_co_truncate = sd_co_truncate, 3262 3263 .bdrv_co_readv = sd_co_readv, 3264 .bdrv_co_writev = sd_co_writev, 3265 .bdrv_co_flush_to_disk = sd_co_flush_to_disk, 3266 .bdrv_co_pdiscard = sd_co_pdiscard, 3267 .bdrv_co_block_status = 
sd_co_block_status, 3268 3269 .bdrv_snapshot_create = sd_snapshot_create, 3270 .bdrv_snapshot_goto = sd_snapshot_goto, 3271 .bdrv_snapshot_delete = sd_snapshot_delete, 3272 .bdrv_snapshot_list = sd_snapshot_list, 3273 3274 .bdrv_save_vmstate = sd_save_vmstate, 3275 .bdrv_load_vmstate = sd_load_vmstate, 3276 3277 .bdrv_detach_aio_context = sd_detach_aio_context, 3278 .bdrv_attach_aio_context = sd_attach_aio_context, 3279 3280 .create_opts = &sd_create_opts, 3281 .strong_runtime_opts = sd_strong_runtime_opts, 3282}; 3283 3284static BlockDriver bdrv_sheepdog_unix = { 3285 .format_name = "sheepdog", 3286 .protocol_name = "sheepdog+unix", 3287 .instance_size = sizeof(BDRVSheepdogState), 3288 .bdrv_parse_filename = sd_parse_filename, 3289 .bdrv_file_open = sd_open, 3290 .bdrv_reopen_prepare = sd_reopen_prepare, 3291 .bdrv_reopen_commit = sd_reopen_commit, 3292 .bdrv_reopen_abort = sd_reopen_abort, 3293 .bdrv_close = sd_close, 3294 .bdrv_co_create = sd_co_create, 3295 .bdrv_co_create_opts = sd_co_create_opts, 3296 .bdrv_has_zero_init = bdrv_has_zero_init_1, 3297 .bdrv_getlength = sd_getlength, 3298 .bdrv_get_allocated_file_size = sd_get_allocated_file_size, 3299 .bdrv_co_truncate = sd_co_truncate, 3300 3301 .bdrv_co_readv = sd_co_readv, 3302 .bdrv_co_writev = sd_co_writev, 3303 .bdrv_co_flush_to_disk = sd_co_flush_to_disk, 3304 .bdrv_co_pdiscard = sd_co_pdiscard, 3305 .bdrv_co_block_status = sd_co_block_status, 3306 3307 .bdrv_snapshot_create = sd_snapshot_create, 3308 .bdrv_snapshot_goto = sd_snapshot_goto, 3309 .bdrv_snapshot_delete = sd_snapshot_delete, 3310 .bdrv_snapshot_list = sd_snapshot_list, 3311 3312 .bdrv_save_vmstate = sd_save_vmstate, 3313 .bdrv_load_vmstate = sd_load_vmstate, 3314 3315 .bdrv_detach_aio_context = sd_detach_aio_context, 3316 .bdrv_attach_aio_context = sd_attach_aio_context, 3317 3318 .create_opts = &sd_create_opts, 3319 .strong_runtime_opts = sd_strong_runtime_opts, 3320}; 3321 3322static void bdrv_sheepdog_init(void) 3323{ 3324 
bdrv_register(&bdrv_sheepdog); 3325 bdrv_register(&bdrv_sheepdog_tcp); 3326 bdrv_register(&bdrv_sheepdog_unix); 3327} 3328block_init(bdrv_sheepdog_init);