本文拟对压缩qcow2镜像所带来的虚拟机性能损失进行简单分析
背景
生产中发现使用压缩镜像启动的虚拟机开机总是会慢一些。
qcow2镜像的压缩方式为:qemu-img convert -p -c -O qcow2 zero_disk.qcow2 compress_disk.qcow2
分析
qemu代码:https://download.qemu.org/qemu-9.2.0.tar.xz
下面主要分析:guest使用qcow2后端时,一次读操作在QEMU后端处理qcow2文件的流程。
block/qcow2.c
后端driver指定读写回调:qcow2_co_preadv_part、qcow2_co_pwritev_part
/*
 * Driver table for the "qcow2" format: every member binds one generic
 * block-driver callback or property to its qcow2 implementation.
 */
BlockDriver bdrv_qcow2 = {
.format_name = "qcow2",
.instance_size = sizeof(BDRVQcow2State),
.bdrv_probe = qcow2_probe,
.bdrv_open = qcow2_open,
.bdrv_close = qcow2_close,
.bdrv_reopen_prepare = qcow2_reopen_prepare,
.bdrv_reopen_commit = qcow2_reopen_commit,
.bdrv_reopen_commit_post = qcow2_reopen_commit_post,
.bdrv_reopen_abort = qcow2_reopen_abort,
.bdrv_join_options = qcow2_join_options,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_co_create_opts = qcow2_co_create_opts,
.bdrv_co_create = qcow2_co_create,
.bdrv_has_zero_init = qcow2_has_zero_init,
.bdrv_co_block_status = qcow2_co_block_status,
/*
 * Data path: reads are routed to qcow2_co_preadv_part() and writes to
 * qcow2_co_pwritev_part().
 */
.bdrv_co_preadv_part = qcow2_co_preadv_part,
.bdrv_co_pwritev_part = qcow2_co_pwritev_part,
.bdrv_co_flush_to_os = qcow2_co_flush_to_os,
.bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes,
.bdrv_co_pdiscard = qcow2_co_pdiscard,
.bdrv_co_copy_range_from = qcow2_co_copy_range_from,
.bdrv_co_copy_range_to = qcow2_co_copy_range_to,
.bdrv_co_truncate = qcow2_co_truncate,
/* Dedicated callback for compressed writes (there is no compressed-read twin) */
.bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
.bdrv_make_empty = qcow2_make_empty,
/* Snapshot callbacks */
.bdrv_snapshot_create = qcow2_snapshot_create,
.bdrv_snapshot_goto = qcow2_snapshot_goto,
.bdrv_snapshot_delete = qcow2_snapshot_delete,
.bdrv_snapshot_list = qcow2_snapshot_list,
.bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
.bdrv_measure = qcow2_measure,
.bdrv_co_get_info = qcow2_co_get_info,
.bdrv_get_specific_info = qcow2_get_specific_info,
.bdrv_co_save_vmstate = qcow2_co_save_vmstate,
.bdrv_co_load_vmstate = qcow2_co_load_vmstate,
.is_format = true,
.supports_backing = true,
.bdrv_co_change_backing_file = qcow2_co_change_backing_file,
.bdrv_refresh_limits = qcow2_refresh_limits,
.bdrv_co_invalidate_cache = qcow2_co_invalidate_cache,
.bdrv_inactivate = qcow2_inactivate,
/* Option descriptions for image creation and amendment */
.create_opts = &qcow2_create_opts,
.amend_opts = &qcow2_amend_opts,
.strong_runtime_opts = qcow2_strong_runtime_opts,
.mutable_opts = mutable_opts,
.bdrv_co_check = qcow2_co_check,
.bdrv_amend_options = qcow2_amend_options,
.bdrv_co_amend = qcow2_co_amend,
.bdrv_detach_aio_context = qcow2_detach_aio_context,
.bdrv_attach_aio_context = qcow2_attach_aio_context,
/* Persistent dirty bitmap callbacks */
.bdrv_supports_persistent_dirty_bitmap =
qcow2_supports_persistent_dirty_bitmap,
.bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
.bdrv_co_remove_persistent_dirty_bitmap =
qcow2_co_remove_persistent_dirty_bitmap,
};
block/qcow2.c
读取时先进入读回调qcow2_co_preadv_part,它再通过qcow2_add_task把每一段实际的读取交给qcow2_co_preadv_task_entry
/*
 * Read @bytes bytes starting at @offset into @qiov, beginning @qiov_offset
 * bytes into the vector.
 *
 * The request is processed chunk by chunk. For each chunk,
 * qcow2_get_host_offset() is called under s->lock to obtain the host offset
 * and the subcluster type. Chunks that read as zeroes are filled in place;
 * every other chunk is handed to qcow2_co_preadv_task_entry() through
 * qcow2_add_task().
 *
 * @flags is not referenced in this function.
 *
 * Returns: a negative value if the lookup, qcow2_add_task() or the task pool
 * reported a failure; otherwise the non-negative status of the last step.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
QEMUIOVector *qiov, size_t qiov_offset,
BdrvRequestFlags flags)
{
BDRVQcow2State *s = bs->opaque;
int ret = 0;
unsigned int cur_bytes; /* number of bytes in current iteration */
uint64_t host_offset = 0;
QCow2SubclusterType type;
AioTaskPool *aio = NULL;
/*
 * Continue until the whole request is consumed or the task pool reports a
 * non-zero status. Note that aio may still be NULL when the status is
 * queried here.
 */
while (bytes != 0 && aio_task_pool_status(aio) == 0) {
/* prepare next request */
cur_bytes = MIN(bytes, INT_MAX);
/* With encryption, cap the chunk at QCOW_MAX_CRYPT_CLUSTERS clusters */
if (s->crypto) {
cur_bytes = MIN(cur_bytes,
QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
}
/*
 * The lookup runs under s->lock. cur_bytes is passed by pointer, so the
 * lookup can adjust the chunk length; host_offset and type are outputs.
 */
qemu_co_mutex_lock(&s->lock);
ret = qcow2_get_host_offset(bs, offset, &cur_bytes,
&host_offset, &type);
qemu_co_mutex_unlock(&s->lock);
if (ret < 0) {
goto out;
}
/*
 * Zero subclusters, and unallocated subclusters when there is no backing
 * file, read as zeroes: fill the destination directly, no task needed.
 */
if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
(type == QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN && !bs->backing) ||
(type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC && !bs->backing))
{
qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes);
} else {
/*
 * Create the task pool lazily, and only when this chunk does not cover
 * the rest of the request; otherwise aio stays NULL.
 */
if (!aio && cur_bytes != bytes) {
aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
}
ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
host_offset, offset, cur_bytes,
qiov, qiov_offset, NULL);
if (ret < 0) {
goto out;
}
}
/* Advance past the chunk that was just handled */
bytes -= cur_bytes;
offset += cur_bytes;
qiov_offset += cur_bytes;
}
out:
/*
 * Wait for every queued task before returning. The pool status is only
 * adopted when no earlier error has been recorded in ret.
 */
if (aio) {
aio_task_pool_wait_all(aio);
if (ret == 0) {
ret = aio_task_pool_status(aio);
}
g_free(aio);
}
return ret;
}
qcow2_co_preadv_task_entry(任务入口,仅做一层包装,直接调用qcow2_co_preadv_task)
/*
 * Task entry point for one chunk of a qcow2 read.
 *
 * Recovers the enclosing Qcow2AioTask from @task and forwards its fields to
 * qcow2_co_preadv_task(), whose result is returned unchanged. A read task
 * must not carry l2meta.
 *
 * Annotating this function GRAPH_RDLOCK is valid because
 * qcow2_co_preadv_part() holds the graph lock and keeps it until this
 * coroutine has terminated.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_task_entry(AioTask *task)
{
    Qcow2AioTask *read_task = container_of(task, Qcow2AioTask, task);
    int ret;

    assert(!read_task->l2meta);

    ret = qcow2_co_preadv_task(read_task->bs,
                               read_task->subcluster_type,
                               read_task->host_offset,
                               read_task->offset,
                               read_task->bytes,
                               read_task->qiov,
                               read_task->qiov_offset);
    return ret;
}
之后qcow2_co_preadv_task会根据当前读取的(子)cluster类型进行分发;如果读到的是压缩的cluster,会走到QCOW2_SUBCLUSTER_COMPRESSED分支,调用qcow2_co_preadv_compressed
/*
 * Perform the read for one chunk, dispatching on its subcluster type.
 *
 *  - COMPRESSED:  delegated to qcow2_co_preadv_compressed(); @host_offset is
 *                 passed in the position of that function's l2_entry
 *                 parameter.
 *  - NORMAL:      read from s->data_file at @host_offset, or through
 *                 qcow2_co_preadv_encrypted() when bs->encrypted is set.
 *  - UNALLOCATED: read from bs->backing at @offset; the caller must only
 *                 get here when a backing file exists.
 *  - ZERO:        never reaches this function (handled in
 *                 qcow2_co_preadv_part()).
 *
 * Returns the result of the function the chunk was delegated to.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type,
                     uint64_t host_offset, uint64_t offset, uint64_t bytes,
                     QEMUIOVector *qiov, size_t qiov_offset)
{
    BDRVQcow2State *state = bs->opaque;

    switch (subc_type) {
    case QCOW2_SUBCLUSTER_COMPRESSED:
        return qcow2_co_preadv_compressed(bs, host_offset,
                                          offset, bytes, qiov, qiov_offset);

    case QCOW2_SUBCLUSTER_NORMAL:
        if (!bs->encrypted) {
            BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
            return bdrv_co_preadv_part(state->data_file, host_offset,
                                       bytes, qiov, qiov_offset, 0);
        }
        return qcow2_co_preadv_encrypted(bs, host_offset,
                                         offset, bytes, qiov, qiov_offset);

    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
        /* Without a backing file this is handled in qcow2_co_preadv_part */
        assert(bs->backing);
        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
        return bdrv_co_preadv_part(bs->backing, offset, bytes,
                                   qiov, qiov_offset, 0);

    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
        /* Both zero types are handled in qcow2_co_preadv_part */
    default:
        g_assert_not_reached();
    }

    g_assert_not_reached();
}
qcow2_co_preadv_compressed
/*
 * Read @bytes bytes at @offset from a compressed cluster into @qiov.
 *
 * @l2_entry is decoded by qcow2_parse_compressed_l2_entry() into the
 * location and size of the compressed data. That data is read from
 * bs->file in full, decompressed into a buffer of s->cluster_size bytes,
 * and only then is the requested range (starting at the offset within the
 * cluster) copied to @qiov at @qiov_offset. Both temporary buffers are
 * released before returning.
 *
 * Returns: the result of bdrv_co_pread() when everything succeeds or when
 * the read itself fails, -ENOMEM if the compressed-data buffer cannot be
 * allocated, -EIO if decompression fails.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_compressed(BlockDriverState *bs,
                           uint64_t l2_entry,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           size_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster = offset_into_cluster(s, offset);
    uint64_t coffset;
    int csize;
    uint8_t *compressed;
    uint8_t *plain;
    int ret;

    qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);

    compressed = g_try_malloc(csize);
    if (!compressed) {
        return -ENOMEM;
    }
    plain = qemu_blockalign(bs, s->cluster_size);

    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_co_pread(bs->file, coffset, csize, compressed, 0);
    if (ret >= 0) {
        if (qcow2_co_decompress(bs, plain, s->cluster_size,
                                compressed, csize) < 0) {
            ret = -EIO;
        } else {
            /* Hand back only the part of the cluster that was asked for */
            qemu_iovec_from_buf(qiov, qiov_offset,
                                plain + offset_in_cluster, bytes);
        }
    }

    qemu_vfree(plain);
    g_free(compressed);
    return ret;
}
最后调用qcow2_co_decompress进行解压缩,它按镜像的压缩类型(zlib或zstd)选择对应的解压函数
/*
 * qcow2_co_decompress()
 *
 * Decompress at most @src_size bytes from @src into exactly @dest_size
 * bytes at @dest. The decompressor is selected from the image's
 * compression type (zlib, or zstd when built with CONFIG_ZSTD) and run
 * through qcow2_co_do_compress().
 *
 * @dest - destination buffer, @dest_size bytes
 * @src - source buffer, @src_size bytes
 *
 * An unrecognised compression type aborts the process.
 *
 * Returns: 0 on success
 *          a negative error code on failure
 */
ssize_t coroutine_fn
qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size,
                    const void *src, size_t src_size)
{
    BDRVQcow2State *s = bs->opaque;

    switch (s->compression_type) {
    case QCOW2_COMPRESSION_TYPE_ZLIB:
        return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
                                    qcow2_zlib_decompress);

#ifdef CONFIG_ZSTD
    case QCOW2_COMPRESSION_TYPE_ZSTD:
        return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
                                    qcow2_zstd_decompress);
#endif

    default:
        abort();
    }
}
总结
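从上面的流程分析可以看出,当qcow2被压缩后,guest的读请求只要落在压缩的cluster上,host后端就要先把该cluster的压缩数据整体读出(csize字节),再解压出完整的一个cluster(s->cluster_size字节),最后只拷贝请求的那一部分数据;从这段代码看,解压用的缓冲区在函数返回前就被释放,再次读取同一cluster时需要重新解压。每次这样的读取都会让host后端进行一次解压缩,这会消耗cpu,与背景中观察到的压缩镜像开机变慢的现象相符。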
参考
https://github.com/qemu/qemu