原文地址:https://lists.gnu.org/archive/html/qemu-devel/2012-06/msg03825.html
[Qemu-devel] [RFC 5/7] qcow2: implement lazy refcounts
From: |
Stefan Hajnoczi |
Subject: |
[Qemu-devel] [RFC 5/7] qcow2: implement lazy refcounts |
Date: |
Fri, 22 Jun 2012 16:08:44 +0100 |
Lazy refcounts is a performance optimization for qcow2 that postpones
refcount metadata updates and instead marks the image dirty. In the
case of crash or power failure the image will be left in a dirty state
and repaired next time it is opened.
Reducing metadata I/O is important for cache=writethrough and
cache=directsync because these modes guarantee that data is on disk
after each write (hence we cannot take advantage of caching updates in
RAM). Refcount metadata is not needed for guest->file block address
translation and therefore does not need to be on-disk at the time of
write completion - this is the motivation behind the lazy refcount
optimization.
The lazy refcount optimization must be enabled at image creation time:
qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=on a.qcow2 10G
qemu-system-x86_64 -drive if=virtio,file=a.qcow2,cache=writethrough
Signed-off-by: Stefan Hajnoczi <address@hidden>
---
block/qcow2-cluster.c | 5 +++-
block/qcow2.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++---
block/qcow2.h | 8 ++++++
block_int.h | 26 ++++++++++---------
4 files changed, 89 insertions(+), 17 deletions(-)
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d7e0e19..e179211 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -662,7 +662,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
QCowL2Meta *m)
qcow2_cache_depends_on_flush(s->l2_table_cache);
}
- qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
+ if (qcow2_need_accurate_refcounts(s)) {
+ qcow2_cache_set_dependency(bs, s->l2_table_cache,
+ s->refcount_block_cache);
+ }
ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
if (ret < 0) {
goto err;
diff --git a/block/qcow2.c b/block/qcow2.c
index cc30784..b54955c 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -215,6 +215,40 @@ static void report_unsupported_feature(BlockDriverState
*bs,
}
/*
+ * Sets the dirty bit and flushes afterwards if necessary.
+ *
+ * The incompatible_features bit is only set if the image file header was
+ * updated successfully. Therefore it is not required to check the return
+ * value of this function.
+ */
+static int qcow2_mark_dirty(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t val;
+ int ret;
+
+ if (s->incompatible_features & QCOW2_INCOMPATIBLE_FEAT_DIRTY) {
+ return 0; /* already dirty */
+ }
+
+ val = cpu_to_be64(s->incompatible_features |
+ QCOW2_INCOMPATIBLE_FEAT_DIRTY);
+ ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
+ &val, sizeof(val));
+ if (ret < 0) {
+ return ret;
+ }
+ ret = bdrv_flush(bs->file);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Only treat image as dirty if the header was updated successfully */
+ s->incompatible_features |= QCOW2_INCOMPATIBLE_FEAT_DIRTY;
+ return 0;
+}
+
+/*
* Clears the dirty bit and flushes before if necessary. Only call this
* function when there are no pending requests, it does not guard against
* concurrent requests dirtying the image.
@@ -755,6 +789,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState
*bs,
goto fail;
}
+ if (l2meta.nb_clusters > 0 &&
+ (s->compatible_features & QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS)) {
+ qcow2_mark_dirty(bs);
+ }
+
cluster_offset = l2meta.cluster_offset;
assert((cluster_offset & 511) == 0);
@@ -1175,6 +1214,11 @@ static int qcow2_create2(const char *filename, int64_t
total_size,
header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
}
+ if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
+ header.compatible_features |=
+ cpu_to_be64(QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS);
+ }
+
ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
if (ret < 0) {
goto out;
@@ -1288,6 +1332,8 @@ static int qcow2_create(const char *filename,
QEMUOptionParameter *options)
options->value.s);
return -EINVAL;
}
+ } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+ flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
}
options++;
}
@@ -1298,6 +1344,12 @@ static int qcow2_create(const char *filename,
QEMUOptionParameter *options)
return -EINVAL;
}
+ if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
+ fprintf(stderr, "Lazy refcounts only supported with compatibility "
+ "level 1.1 and above (use compat=1.1 or greater)\n");
+ return -EINVAL;
+ }
+
return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
cluster_size, prealloc, options, version);
}
@@ -1484,10 +1536,12 @@ static coroutine_fn int
qcow2_co_flush_to_os(BlockDriverState *bs)
return ret;
}
- ret = qcow2_cache_flush(bs, s->refcount_block_cache);
- if (ret < 0) {
- qemu_co_mutex_unlock(&s->lock);
- return ret;
+ if (qcow2_need_accurate_refcounts(s)) {
+ ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+ if (ret < 0) {
+ qemu_co_mutex_unlock(&s->lock);
+ return ret;
+ }
}
qemu_co_mutex_unlock(&s->lock);
@@ -1602,6 +1656,11 @@ static QEMUOptionParameter qcow2_create_options[] = {
.type = OPT_STRING,
.help = "Preallocation mode (allowed values: off, metadata)"
},
+ {
+ .name = BLOCK_OPT_LAZY_REFCOUNTS,
+ .type = OPT_FLAG,
+ .help = "Postpone refcount updates",
+ },
{ NULL }
};
diff --git a/block/qcow2.h b/block/qcow2.h
index 5c7cfb6..b8c0beb 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -111,6 +111,9 @@ enum {
QCOW2_INCOMPATIBLE_FEAT_DIRTY = 0x1,
QCOW2_INCOMPATIBLE_FEAT_MASK = QCOW2_INCOMPATIBLE_FEAT_DIRTY,
+
+ QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS = 0x1,
+ QCOW2_COMPATIBLE_FEAT_MASK = QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS,
};
typedef struct Qcow2Feature {
@@ -240,6 +243,11 @@ static inline int qcow2_get_cluster_type(uint64_t l2_entry)
}
}
+/* Check whether refcounts are eager or lazy */
+static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
+{
+ return !(s->incompatible_features & QCOW2_INCOMPATIBLE_FEAT_DIRTY);
+}
// FIXME Need qcow2_ prefix to global functions
diff --git a/block_int.h b/block_int.h
index 1fb5352..733aa71 100644
--- a/block_int.h
+++ b/block_int.h
@@ -31,8 +31,9 @@
#include "qemu-timer.h"
#include "qapi-types.h"
-#define BLOCK_FLAG_ENCRYPT 1
-#define BLOCK_FLAG_COMPAT6 4
+#define BLOCK_FLAG_ENCRYPT 1
+#define BLOCK_FLAG_COMPAT6 4
+#define BLOCK_FLAG_LAZY_REFCOUNTS 8
#define BLOCK_IO_LIMIT_READ 0
#define BLOCK_IO_LIMIT_WRITE 1
@@ -41,16 +42,17 @@
#define BLOCK_IO_SLICE_TIME 100000000
#define NANOSECONDS_PER_SECOND 1000000000.0
-#define BLOCK_OPT_SIZE "size"
-#define BLOCK_OPT_ENCRYPT "encryption"
-#define BLOCK_OPT_COMPAT6 "compat6"
-#define BLOCK_OPT_BACKING_FILE "backing_file"
-#define BLOCK_OPT_BACKING_FMT "backing_fmt"
-#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
-#define BLOCK_OPT_TABLE_SIZE "table_size"
-#define BLOCK_OPT_PREALLOC "preallocation"
-#define BLOCK_OPT_SUBFMT "subformat"
-#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_SIZE "size"
+#define BLOCK_OPT_ENCRYPT "encryption"
+#define BLOCK_OPT_COMPAT6 "compat6"
+#define BLOCK_OPT_BACKING_FILE "backing_file"
+#define BLOCK_OPT_BACKING_FMT "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE "table_size"
+#define BLOCK_OPT_PREALLOC "preallocation"
+#define BLOCK_OPT_SUBFMT "subformat"
+#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
typedef struct BdrvTrackedRequest BdrvTrackedRequest;
--
1.7.10