本文拟对使用压缩qcow2镜像所带来的虚拟机性能损失进行简单分析。

背景

生产中发现使用压缩镜像启动的虚拟机开机总是会慢一些。
qcow2镜像的压缩方式为:qemu-img convert -p -c -O qcow2 zero_disk.qcow2 compress_disk.qcow2

分析

qemu代码:https://download.qemu.org/qemu-9.2.0.tar.xz
下面主要梳理guest发起读操作时,qcow2后端驱动读取qcow2文件的流程。

block/qcow2.c
后端driver指定读写回调:qcow2_co_preadv_part、qcow2_co_pwritev_part

/*
 * Driver table for the "qcow2" format: each member binds a BlockDriver
 * callback (or option table) to its qcow2-specific implementation.
 */
BlockDriver bdrv_qcow2 = {
    .format_name                        = "qcow2",
    .instance_size                      = sizeof(BDRVQcow2State),
    .bdrv_probe                         = qcow2_probe,
    .bdrv_open                          = qcow2_open,
    .bdrv_close                         = qcow2_close,
    .bdrv_reopen_prepare                = qcow2_reopen_prepare,
    .bdrv_reopen_commit                 = qcow2_reopen_commit,
    .bdrv_reopen_commit_post            = qcow2_reopen_commit_post,
    .bdrv_reopen_abort                  = qcow2_reopen_abort,
    .bdrv_join_options                  = qcow2_join_options,
    .bdrv_child_perm                    = bdrv_default_perms,
    .bdrv_co_create_opts                = qcow2_co_create_opts,
    .bdrv_co_create                     = qcow2_co_create,
    .bdrv_has_zero_init                 = qcow2_has_zero_init,
    .bdrv_co_block_status               = qcow2_co_block_status,

    /* Vectored read/write entry points and flush. */
    .bdrv_co_preadv_part                = qcow2_co_preadv_part,
    .bdrv_co_pwritev_part               = qcow2_co_pwritev_part,
    .bdrv_co_flush_to_os                = qcow2_co_flush_to_os,

    .bdrv_co_pwrite_zeroes              = qcow2_co_pwrite_zeroes,
    .bdrv_co_pdiscard                   = qcow2_co_pdiscard,
    .bdrv_co_copy_range_from            = qcow2_co_copy_range_from,
    .bdrv_co_copy_range_to              = qcow2_co_copy_range_to,
    .bdrv_co_truncate                   = qcow2_co_truncate,
    /*
     * Write entry point for compressed data.
     * NOTE(review): presumably the path used by "qemu-img convert -c" --
     * confirm against the callers.
     */
    .bdrv_co_pwritev_compressed_part    = qcow2_co_pwritev_compressed_part,
    .bdrv_make_empty                    = qcow2_make_empty,

    /* Internal snapshot handling and image information queries. */
    .bdrv_snapshot_create               = qcow2_snapshot_create,
    .bdrv_snapshot_goto                 = qcow2_snapshot_goto,
    .bdrv_snapshot_delete               = qcow2_snapshot_delete,
    .bdrv_snapshot_list                 = qcow2_snapshot_list,
    .bdrv_snapshot_load_tmp             = qcow2_snapshot_load_tmp,
    .bdrv_measure                       = qcow2_measure,
    .bdrv_co_get_info                   = qcow2_co_get_info,
    .bdrv_get_specific_info             = qcow2_get_specific_info,

    /* VM state save/load. */
    .bdrv_co_save_vmstate               = qcow2_co_save_vmstate,
    .bdrv_co_load_vmstate               = qcow2_co_load_vmstate,

    /* qcow2 is flagged as a format driver and supports backing files. */
    .is_format                          = true,
    .supports_backing                   = true,
    .bdrv_co_change_backing_file        = qcow2_co_change_backing_file,

    .bdrv_refresh_limits                = qcow2_refresh_limits,
    .bdrv_co_invalidate_cache           = qcow2_co_invalidate_cache,
    .bdrv_inactivate                    = qcow2_inactivate,

    /* Option tables plus the check/amend callbacks. */
    .create_opts                        = &qcow2_create_opts,
    .amend_opts                         = &qcow2_amend_opts,
    .strong_runtime_opts                = qcow2_strong_runtime_opts,
    .mutable_opts                       = mutable_opts,
    .bdrv_co_check                      = qcow2_co_check,
    .bdrv_amend_options                 = qcow2_amend_options,
    .bdrv_co_amend                      = qcow2_co_amend,

    .bdrv_detach_aio_context            = qcow2_detach_aio_context,
    .bdrv_attach_aio_context            = qcow2_attach_aio_context,

    /* Persistent dirty bitmap support. */
    .bdrv_supports_persistent_dirty_bitmap =
            qcow2_supports_persistent_dirty_bitmap,
    .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
    .bdrv_co_remove_persistent_dirty_bitmap =
            qcow2_co_remove_persistent_dirty_bitmap,
};

block/qcow2.c
读取时走读回调qcow2_co_preadv_part,它再通过qcow2_add_task把每一段读请求交给qcow2_co_preadv_task_entry处理

/*
 * Vectored read for qcow2: fill @qiov (starting @qiov_offset bytes into the
 * vector) with @bytes bytes of image data beginning at @offset.
 *
 * The request is handled chunk by chunk. For each chunk the mapping is looked
 * up with qcow2_get_host_offset() while holding s->lock; chunks that read as
 * zeroes are filled in directly, everything else is handed to
 * qcow2_co_preadv_task_entry() through qcow2_add_task().
 *
 * @flags is not referenced in this function.
 *
 * Returns: a negative value if the mapping lookup or task submission fails;
 *          otherwise the last return value, replaced by the task pool status
 *          when a pool was created and no earlier error was recorded.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
                     QEMUIOVector *qiov, size_t qiov_offset,
                     BdrvRequestFlags flags)
{
    BDRVQcow2State *s = bs->opaque;
    int ret = 0;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t host_offset = 0;
    QCow2SubclusterType type;
    AioTaskPool *aio = NULL;

    /*
     * Keep going while data remains and the task pool reports status 0.
     * NOTE(review): aio is NULL until a pool is created below, so
     * aio_task_pool_status() presumably treats NULL as "no error" -- confirm.
     */
    while (bytes != 0 && aio_task_pool_status(aio) == 0) {
        /*
         * Prepare the next chunk: at most INT_MAX bytes, and with encryption
         * at most QCOW_MAX_CRYPT_CLUSTERS clusters.
         */
        cur_bytes = MIN(bytes, INT_MAX);
        if (s->crypto) {
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

        /*
         * The mapping lookup is done under s->lock. cur_bytes is passed by
         * pointer; presumably it is trimmed to a run of a single subcluster
         * type -- confirm in qcow2_get_host_offset().
         */
        qemu_co_mutex_lock(&s->lock);
        ret = qcow2_get_host_offset(bs, offset, &cur_bytes,
                                    &host_offset, &type);
        qemu_co_mutex_unlock(&s->lock);
        if (ret < 0) {
            goto out;
        }

        /*
         * Zero subclusters, and unallocated ones when there is no backing
         * file, are satisfied by zero-filling the destination vector.
         */
        if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
            (type == QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN && !bs->backing) ||
            (type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC && !bs->backing))
        {
            qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes);
        } else {
            /*
             * Create the task pool lazily, only when this chunk does not
             * cover the whole remaining request.
             */
            if (!aio && cur_bytes != bytes) {
                aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
            }
            ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
                                 host_offset, offset, cur_bytes,
                                 qiov, qiov_offset, NULL);
            if (ret < 0) {
                goto out;
            }
        }

        /* Advance past the chunk that was just handled. */
        bytes -= cur_bytes;
        offset += cur_bytes;
        qiov_offset += cur_bytes;
    }

out:
    /*
     * If a pool was created, wait for every queued task before returning and
     * report the pool status unless an error was already recorded in ret.
     */
    if (aio) {
        aio_task_pool_wait_all(aio);
        if (ret == 0) {
            ret = aio_task_pool_status(aio);
        }
        g_free(aio);
    }

    return ret;
}

qcow2_co_preadv_task_entry(只是一层封装,取出任务参数后直接调用qcow2_co_preadv_task)

/*
 * AioTask callback for the read path: recover the Qcow2AioTask that embeds
 * @task and forward its fields to qcow2_co_preadv_task().
 *
 * Counting this as GRAPH_RDLOCK is valid because qcow2_co_preadv_part() takes
 * the graph lock and does not drop it before this coroutine has terminated.
 */
static int coroutine_fn GRAPH_RDLOCK qcow2_co_preadv_task_entry(AioTask *task)
{
    Qcow2AioTask *rd_task = container_of(task, Qcow2AioTask, task);
    int ret;

    /* A read task must not have l2meta attached. */
    assert(!rd_task->l2meta);

    ret = qcow2_co_preadv_task(rd_task->bs,
                               rd_task->subcluster_type,
                               rd_task->host_offset,
                               rd_task->offset,
                               rd_task->bytes,
                               rd_task->qiov,
                               rd_task->qiov_offset);
    return ret;
}

qcow2_co_preadv_task会根据当前读取的(sub)cluster类型进行分发;如果该cluster是以压缩方式存储的,就会走到QCOW2_SUBCLUSTER_COMPRESSED分支,调用qcow2_co_preadv_compressed

/*
 * Perform one read chunk according to its subcluster type.
 *
 * @subc_type selects the data source:
 *  - compressed:  qcow2_co_preadv_compressed(), with @host_offset passed as
 *                 the l2_entry argument
 *  - normal:      the encrypted read helper when bs->encrypted is set,
 *                 otherwise a plain read from s->data_file at @host_offset
 *  - unallocated: read from the backing file at @offset (a backing file is
 *                 asserted to exist)
 *  - zero types:  must never get here; qcow2_co_preadv_part() handles them
 *
 * Returns whatever the selected read helper returns.
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type,
                     uint64_t host_offset, uint64_t offset, uint64_t bytes,
                     QEMUIOVector *qiov, size_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;

    switch (subc_type) {
    case QCOW2_SUBCLUSTER_COMPRESSED:
        return qcow2_co_preadv_compressed(bs, host_offset, offset, bytes,
                                          qiov, qiov_offset);

    case QCOW2_SUBCLUSTER_NORMAL:
        if (!bs->encrypted) {
            BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
            return bdrv_co_preadv_part(s->data_file, host_offset, bytes,
                                       qiov, qiov_offset, 0);
        }
        return qcow2_co_preadv_encrypted(bs, host_offset, offset, bytes,
                                         qiov, qiov_offset);

    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
        /* Without a backing file, qcow2_co_preadv_part handles these */
        assert(bs->backing);

        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
        return bdrv_co_preadv_part(bs->backing, offset, bytes,
                                   qiov, qiov_offset, 0);

    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
        /* Both zero types are dealt with in qcow2_co_preadv_part */
    default:
        g_assert_not_reached();
    }

    g_assert_not_reached();
}

qcow2_co_preadv_compressed

/*
 * Read part of a compressed cluster.
 *
 * The compressed payload described by @l2_entry is read from bs->file into a
 * temporary buffer and decompressed into a buffer of s->cluster_size bytes;
 * @bytes bytes, starting at the in-cluster position of @offset, are then
 * copied into @qiov at @qiov_offset.
 *
 * Returns: -ENOMEM if the compressed-data buffer cannot be allocated,
 *          the negative result of bdrv_co_pread() if the read fails,
 *          -EIO if decompression fails,
 *          otherwise the (non-negative) result of bdrv_co_pread().
 */
static int coroutine_fn GRAPH_RDLOCK
qcow2_co_preadv_compressed(BlockDriverState *bs,
                           uint64_t l2_entry,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           size_t qiov_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int skip = offset_into_cluster(s, offset);
    uint64_t comp_offset;
    int comp_size;
    uint8_t *comp_buf;
    uint8_t *cluster_buf;
    int ret;

    qcow2_parse_compressed_l2_entry(bs, l2_entry, &comp_offset, &comp_size);

    comp_buf = g_try_malloc(comp_size);
    if (comp_buf == NULL) {
        return -ENOMEM;
    }

    cluster_buf = qemu_blockalign(bs, s->cluster_size);

    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
    ret = bdrv_co_pread(bs->file, comp_offset, comp_size, comp_buf, 0);
    if (ret >= 0) {
        /* The whole cluster is decompressed even for a partial read. */
        if (qcow2_co_decompress(bs, cluster_buf, s->cluster_size,
                                comp_buf, comp_size) < 0) {
            ret = -EIO;
        } else {
            qemu_iovec_from_buf(qiov, qiov_offset, cluster_buf + skip, bytes);
        }
    }

    qemu_vfree(cluster_buf);
    g_free(comp_buf);

    return ret;
}

最后调用qcow2_co_decompress进行解压缩,它会根据镜像的压缩类型(zlib或zstd)选择对应的解压函数

/*
 * qcow2_co_decompress()
 *
 * Decompress at most @src_size bytes from @src so that exactly @dest_size
 * bytes are produced in @dest. The decompression routine is chosen from the
 * image's compression type (s->compression_type); a type that is not handled
 * here (including zstd when CONFIG_ZSTD is not defined) aborts the process.
 *
 * @dest - destination buffer, @dest_size bytes
 * @src - source buffer, @src_size bytes
 *
 * Returns: 0 on success
 *          a negative error code on failure
 */
ssize_t coroutine_fn
qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size,
                    const void *src, size_t src_size)
{
    BDRVQcow2State *s = bs->opaque;

    switch (s->compression_type) {
    case QCOW2_COMPRESSION_TYPE_ZLIB:
        return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
                                    qcow2_zlib_decompress);

#ifdef CONFIG_ZSTD
    case QCOW2_COMPRESSION_TYPE_ZSTD:
        return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
                                    qcow2_zstd_decompress);
#endif

    default:
        abort();
    }
}

总结

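从上面的流程分析可以看出,qcow2镜像被压缩后,guest每次读到以压缩方式存储的cluster时,host后端都要先把该cluster的压缩数据整块读出,再解压出完整的一个cluster(即使guest只请求其中的一部分,见qcow2_co_preadv_compressed),然后才把所需的部分拷贝给guest。上面这段读路径中也没有看到对解压结果的缓存,因此这会额外消耗host的cpu并增加读延迟,开机这类读操作密集的场景就会变慢。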

参考

https://github.com/qemu/qemu

12-23 12:31