qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

qcow2: add shrink image support

This patch adds support for shrinking qcow2 image files. As a result, we can
reduce the virtual image size and free up space on the disk without copying
the image. The image may be fragmented, so shrinking is done by punching holes
in the image file.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20170918124230.8152-4-pbutsykin@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>

authored by

Pavel Butsykin and committed by
Max Reitz
46b732cd f71c08ea

+225 -10
+50
block/qcow2-cluster.c
··· 32 32 #include "qemu/bswap.h" 33 33 #include "trace.h" 34 34 35 + int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size) 36 + { 37 + BDRVQcow2State *s = bs->opaque; 38 + int new_l1_size, i, ret; 39 + 40 + if (exact_size >= s->l1_size) { 41 + return 0; 42 + } 43 + 44 + new_l1_size = exact_size; 45 + 46 + #ifdef DEBUG_ALLOC2 47 + fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size); 48 + #endif 49 + 50 + BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE); 51 + ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset + 52 + new_l1_size * sizeof(uint64_t), 53 + (s->l1_size - new_l1_size) * sizeof(uint64_t), 0); 54 + if (ret < 0) { 55 + goto fail; 56 + } 57 + 58 + ret = bdrv_flush(bs->file->bs); 59 + if (ret < 0) { 60 + goto fail; 61 + } 62 + 63 + BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS); 64 + for (i = s->l1_size - 1; i > new_l1_size - 1; i--) { 65 + if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) { 66 + continue; 67 + } 68 + qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK, 69 + s->cluster_size, QCOW2_DISCARD_ALWAYS); 70 + s->l1_table[i] = 0; 71 + } 72 + return 0; 73 + 74 + fail: 75 + /* 76 + * If the write in the l1_table failed the image may contain a partially 77 + * overwritten l1_table. In this case it would be better to clear the 78 + * l1_table in memory to avoid possible image corruption. 79 + */ 80 + memset(s->l1_table + new_l1_size, 0, 81 + (s->l1_size - new_l1_size) * sizeof(uint64_t)); 82 + return ret; 83 + } 84 + 35 85 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, 36 86 bool exact_size) 37 87 {
+120
block/qcow2-refcount.c
··· 29 29 #include "block/qcow2.h" 30 30 #include "qemu/range.h" 31 31 #include "qemu/bswap.h" 32 + #include "qemu/cutils.h" 32 33 33 34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); 34 35 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, ··· 3061 3062 qemu_vfree(new_refblock); 3062 3063 return ret; 3063 3064 } 3065 + 3066 + static int qcow2_discard_refcount_block(BlockDriverState *bs, 3067 + uint64_t discard_block_offs) 3068 + { 3069 + BDRVQcow2State *s = bs->opaque; 3070 + uint64_t refblock_offs = get_refblock_offset(s, discard_block_offs); 3071 + uint64_t cluster_index = discard_block_offs >> s->cluster_bits; 3072 + uint32_t block_index = cluster_index & (s->refcount_block_size - 1); 3073 + void *refblock; 3074 + int ret; 3075 + 3076 + assert(discard_block_offs != 0); 3077 + 3078 + ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs, 3079 + &refblock); 3080 + if (ret < 0) { 3081 + return ret; 3082 + } 3083 + 3084 + if (s->get_refcount(refblock, block_index) != 1) { 3085 + qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:" 3086 + " refblock offset %#" PRIx64 3087 + ", reftable index %u" 3088 + ", block offset %#" PRIx64 3089 + ", refcount %#" PRIx64, 3090 + refblock_offs, 3091 + offset_to_reftable_index(s, discard_block_offs), 3092 + discard_block_offs, 3093 + s->get_refcount(refblock, block_index)); 3094 + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 3095 + return -EINVAL; 3096 + } 3097 + s->set_refcount(refblock, block_index, 0); 3098 + 3099 + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock); 3100 + 3101 + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 3102 + 3103 + if (cluster_index < s->free_cluster_index) { 3104 + s->free_cluster_index = cluster_index; 3105 + } 3106 + 3107 + refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache, 3108 + discard_block_offs); 3109 + if (refblock) { 3110 + /* discard refblock from the cache if refblock 
is cached */ 3111 + qcow2_cache_discard(bs, s->refcount_block_cache, refblock); 3112 + } 3113 + update_refcount_discard(bs, discard_block_offs, s->cluster_size); 3114 + 3115 + return 0; 3116 + } 3117 + 3118 + int qcow2_shrink_reftable(BlockDriverState *bs) 3119 + { 3120 + BDRVQcow2State *s = bs->opaque; 3121 + uint64_t *reftable_tmp = 3122 + g_malloc(s->refcount_table_size * sizeof(uint64_t)); 3123 + int i, ret; 3124 + 3125 + for (i = 0; i < s->refcount_table_size; i++) { 3126 + int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK; 3127 + void *refblock; 3128 + bool unused_block; 3129 + 3130 + if (refblock_offs == 0) { 3131 + reftable_tmp[i] = 0; 3132 + continue; 3133 + } 3134 + ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs, 3135 + &refblock); 3136 + if (ret < 0) { 3137 + goto out; 3138 + } 3139 + 3140 + /* the refblock has own reference */ 3141 + if (i == offset_to_reftable_index(s, refblock_offs)) { 3142 + uint64_t block_index = (refblock_offs >> s->cluster_bits) & 3143 + (s->refcount_block_size - 1); 3144 + uint64_t refcount = s->get_refcount(refblock, block_index); 3145 + 3146 + s->set_refcount(refblock, block_index, 0); 3147 + 3148 + unused_block = buffer_is_zero(refblock, s->cluster_size); 3149 + 3150 + s->set_refcount(refblock, block_index, refcount); 3151 + } else { 3152 + unused_block = buffer_is_zero(refblock, s->cluster_size); 3153 + } 3154 + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 3155 + 3156 + reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]); 3157 + } 3158 + 3159 + ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp, 3160 + s->refcount_table_size * sizeof(uint64_t)); 3161 + /* 3162 + * If the write in the reftable failed the image may contain a partially 3163 + * overwritten reftable. In this case it would be better to clear the 3164 + * reftable in memory to avoid possible image corruption. 
3165 + */ 3166 + for (i = 0; i < s->refcount_table_size; i++) { 3167 + if (s->refcount_table[i] && !reftable_tmp[i]) { 3168 + if (ret == 0) { 3169 + ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] & 3170 + REFT_OFFSET_MASK); 3171 + } 3172 + s->refcount_table[i] = 0; 3173 + } 3174 + } 3175 + 3176 + if (!s->cache_discards) { 3177 + qcow2_process_discards(bs, ret); 3178 + } 3179 + 3180 + out: 3181 + g_free(reftable_tmp); 3182 + return ret; 3183 + }
+34 -9
block/qcow2.c
··· 3104 3104 } 3105 3105 3106 3106 old_length = bs->total_sectors * 512; 3107 + new_l1_size = size_to_l1(s, offset); 3107 3108 3108 - /* shrinking is currently not supported */ 3109 3109 if (offset < old_length) { 3110 - error_setg(errp, "qcow2 doesn't support shrinking images yet"); 3111 - return -ENOTSUP; 3112 - } 3110 + if (prealloc != PREALLOC_MODE_OFF) { 3111 + error_setg(errp, 3112 + "Preallocation can't be used for shrinking an image"); 3113 + return -EINVAL; 3114 + } 3115 + 3116 + ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size), 3117 + old_length - ROUND_UP(offset, 3118 + s->cluster_size), 3119 + QCOW2_DISCARD_ALWAYS, true); 3120 + if (ret < 0) { 3121 + error_setg_errno(errp, -ret, "Failed to discard cropped clusters"); 3122 + return ret; 3123 + } 3124 + 3125 + ret = qcow2_shrink_l1_table(bs, new_l1_size); 3126 + if (ret < 0) { 3127 + error_setg_errno(errp, -ret, 3128 + "Failed to reduce the number of L2 tables"); 3129 + return ret; 3130 + } 3113 3131 3114 - new_l1_size = size_to_l1(s, offset); 3115 - ret = qcow2_grow_l1_table(bs, new_l1_size, true); 3116 - if (ret < 0) { 3117 - error_setg_errno(errp, -ret, "Failed to grow the L1 table"); 3118 - return ret; 3132 + ret = qcow2_shrink_reftable(bs); 3133 + if (ret < 0) { 3134 + error_setg_errno(errp, -ret, 3135 + "Failed to discard unused refblocks"); 3136 + return ret; 3137 + } 3138 + } else { 3139 + ret = qcow2_grow_l1_table(bs, new_l1_size, true); 3140 + if (ret < 0) { 3141 + error_setg_errno(errp, -ret, "Failed to grow the L1 table"); 3142 + return ret; 3143 + } 3119 3144 } 3120 3145 3121 3146 switch (prealloc) {
+14
block/qcow2.h
··· 521 521 return r1 > r2 ? r1 - r2 : r2 - r1; 522 522 } 523 523 524 + static inline 525 + uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset) 526 + { 527 + return offset >> (s->refcount_block_bits + s->cluster_bits); 528 + } 529 + 530 + static inline uint64_t get_refblock_offset(BDRVQcow2State *s, uint64_t offset) 531 + { 532 + uint32_t index = offset_to_reftable_index(s, offset); 533 + return s->refcount_table[index] & REFT_OFFSET_MASK; 534 + } 535 + 524 536 /* qcow2.c functions */ 525 537 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, 526 538 int64_t sector_num, int nb_sectors); ··· 584 596 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, 585 597 BlockDriverAmendStatusCB *status_cb, 586 598 void *cb_opaque, Error **errp); 599 + int qcow2_shrink_reftable(BlockDriverState *bs); 587 600 588 601 /* qcow2-cluster.c functions */ 589 602 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, 590 603 bool exact_size); 604 + int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size); 591 605 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); 592 606 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); 593 607 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
+7 -1
qapi/block-core.json
··· 2533 2533 # 2534 2534 # Trigger events supported by blkdebug. 2535 2535 # 2536 + # @l1_shrink_write_table: write zeros to the l1 table to shrink image. 2537 + # (since 2.11) 2538 + # 2539 + # @l1_shrink_free_l2_clusters: discard the l2 tables. (since 2.11) 2540 + # 2536 2541 # Since: 2.9 2537 2542 ## 2538 2543 { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG', ··· 2549 2554 'cluster_alloc_bytes', 'cluster_free', 'flush_to_os', 2550 2555 'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head', 2551 2556 'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev', 2552 - 'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] } 2557 + 'pwritev_zero', 'pwritev_done', 'empty_image_prepare', 2558 + 'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] } 2553 2559 2554 2560 ## 2555 2561 # @BlkdebugInjectErrorOptions: