Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial version of time dependent geometry #9

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -686,8 +686,8 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
Expand Down
8 changes: 8 additions & 0 deletions include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ extern "C" {
struct zio;
struct raidz_row;
struct raidz_map;
struct vdev_raidz;
#if !defined(_KERNEL)
struct kernel_param {};
#endif
Expand All @@ -47,6 +48,7 @@ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
void vdev_raidz_map_free(struct raidz_map *);
void vdev_raidz_free(struct vdev_raidz *);
void vdev_raidz_generate_parity(struct raidz_map *);
void vdev_raidz_reconstruct(struct raidz_map *, const int *, int);

Expand Down Expand Up @@ -95,6 +97,12 @@ typedef struct vdev_raidz {
int vd_logical_width;
int vd_physical_width;
int vd_nparity;

/*
* Last reflow txg per attached device.
*/
avl_tree_t vre_txgs;

/*
* If this vdev is being expanded, spa_raidz_expand is set to this
*/
Expand Down
7 changes: 3 additions & 4 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>

Expand Down Expand Up @@ -909,10 +910,8 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

if (vd->vdev_ops == &vdev_raidz_ops) {
vdev_raidz_t *rz = vd->vdev_tsd;
kmem_free(rz, sizeof (*rz));
}
if (vd->vdev_ops == &vdev_raidz_ops)
vdev_raidz_free(vd->vdev_tsd);

/*
* Discard allocation state.
Expand Down
149 changes: 124 additions & 25 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,41 @@ vdev_raidz_map_free_vsd(zio_t *zio)
}
}

/*
 * The AVL tree node below is the main part of the expanded raidz
 * time-dependent geometry logic.  It allows BPs stored in the reflowed
 * format and the normal format to be mixed, distinguished by each BP's
 * birth txg.  A new reflow node is added when the first expansion
 * starts, and again each time an expansion's reflow process completes.
 * re_txg is the txg in which that reflow process completed, and
 * re_logical_width is the logical width required to read a BP written
 * in that format.
 */
typedef struct reflow_node {
uint64_t re_txg;
uint64_t re_logical_width;
ahrens marked this conversation as resolved.
Show resolved Hide resolved
avl_node_t re_link;
} reflow_node_t;

/*
 * AVL comparator for reflow_node_t entries; orders nodes by re_txg.
 * NOTE(review): "vedv" appears to be a typo for "vdev"; the name is kept
 * as-is because the avl_create() call site references it by this name.
 */
static int
vedv_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *a = x1;
	const reflow_node_t *b = x2;

	return (TREE_CMP(a->re_txg, b->re_txg));
}

/*
 * Free a vdev_raidz_t, including every reflow_node_t accumulated in its
 * vre_txgs tree.  Called from vdev_free() with vd->vdev_tsd, and from
 * the error path of vdev_raidz_get_tsd().
 *
 * Accepts NULL as a defensive no-op, mirroring free() semantics, since
 * callers pass pointers (e.g. vd->vdev_tsd) whose initialization path
 * is not visible here.
 */
void
vdev_raidz_free(vdev_raidz_t *vdrz)
{
	reflow_node_t *re;
	void *cookie = NULL;
	avl_tree_t *tree;

	if (vdrz == NULL)
		return;

	tree = &vdrz->vre_txgs;
	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
		kmem_free(re, sizeof (*re));
	avl_destroy(tree);
	kmem_free(vdrz, sizeof (*vdrz));
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
Expand Down Expand Up @@ -2010,6 +2045,20 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
}
}

/*
 * Return the logical width with which the block referenced by zio->io_bp
 * must be interpreted, based on its physical birth txg: an exact match in
 * the vre_txgs tree yields that node's width; otherwise the width recorded
 * by the most recent reflow completed before the block's birth is used.
 *
 * NOTE(review): this assumes vre_txgs is non-empty and always contains a
 * node with re_txg <= the BP's birth txg (the re_txg == 0 node inserted
 * when the first expansion is attached).  If that invariant ever fails,
 * avl_nearest() returns NULL and the unconditional dereference below
 * crashes — confirm the invariant holds on all config-load paths.
 */
static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, zio_t *zio)
{
	/* Key the lookup on the block's physical birth txg. */
	reflow_node_t *re, lookup = { BP_PHYSICAL_BIRTH(zio->io_bp), 0 };
	avl_index_t where;

	/* Exact hit: block born in the txg a reflow completed. */
	re = avl_find(&vdrz->vre_txgs, &lookup, &where);
	if (re != NULL)
		return (re->re_logical_width);

	/* Otherwise, use the closest preceding (older-txg) entry. */
	re = avl_nearest(&vdrz->vre_txgs, where, AVL_BEFORE);
	return (re->re_logical_width);
}

/*
* Start an IO operation on a RAIDZ VDev
*
Expand All @@ -2036,22 +2085,30 @@ vdev_raidz_io_start(zio_t *zio)
raidz_map_t *rm;

if (vdrz->vd_logical_width != vdrz->vd_physical_width) {
/* XXX rangelock not needed after expansion completes */
zfs_locked_range_t *lr =
zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
zio->io_offset, zio->io_size, RL_READER);

rm = vdev_raidz_map_alloc_expanded(zio->io_abd,
zio->io_size, zio->io_offset,
tvd->vdev_ashift, vdrz->vd_physical_width,
vdrz->vd_logical_width, vdrz->vd_nparity,
vdrz->vn_vre.vre_offset_phys);
rm->rm_lr = lr;
/*
* XXX If this is a write, will need to do additional
* writes to locations that are already copied, but
* not yet reflected in the on-disk format.
*/
uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, zio);
if (vdrz->vn_vre.vre_offset != UINT64_MAX ||
logical_width != vdrz->vd_physical_width) {
/* XXX rangelock not needed after expansion completes */
zfs_locked_range_t *lr =
zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
zio->io_offset, zio->io_size, RL_READER);

rm = vdev_raidz_map_alloc_expanded(zio->io_abd,
zio->io_size, zio->io_offset,
tvd->vdev_ashift, vdrz->vd_physical_width,
logical_width, vdrz->vd_nparity,
vdrz->vn_vre.vre_offset_phys);
rm->rm_lr = lr;
/*
* XXX If this is a write, will need to do additional
* writes to locations that are already copied, but
* not yet reflected in the on-disk format.
*/
} else {
rm = vdev_raidz_map_alloc(zio,
tvd->vdev_ashift, vdrz->vd_physical_width,
vdrz->vd_nparity);
}
} else {
rm = vdev_raidz_map_alloc(zio,
tvd->vdev_ashift, vdrz->vd_logical_width,
Expand Down Expand Up @@ -2977,12 +3034,19 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = arg;
vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
vdev_raidz_t *vdrz = raidvd->vdev_tsd;

for (int i = 0; i < TXG_SIZE; i++)
ASSERT0(vre->vre_offset_pertxg[i]);

vre->vre_offset_phys = UINT64_MAX;

reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
re->re_txg = tx->tx_txg;
re->re_logical_width = vdrz->vd_physical_width;
avl_add(&vdrz->vre_txgs, re);

/*
* vre_offset_phys will be removed from the on-disk config by
* vdev_raidz_config_generate().
Expand Down Expand Up @@ -3337,6 +3401,13 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
new_child);

if (vdrz->vd_logical_width == vdrz->vd_physical_width) {
reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
re->re_txg = 0;
re->re_logical_width = vdrz->vd_logical_width;
avl_add(&vdrz->vre_txgs, re);
}

vdrz->vd_physical_width++;

vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
Expand Down Expand Up @@ -3396,12 +3467,27 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
* it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
vdrz->vd_logical_width);
if (vdrz->vn_vre.vre_offset_phys != UINT64_MAX) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET,
vdrz->vn_vre.vre_offset_phys);
}

if (!avl_is_empty(&vdrz->vre_txgs)) {
uint64_t i = 0, count = avl_numnodes(&vdrz->vre_txgs);
uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
KM_SLEEP);

for (reflow_node_t *re =
avl_first(&vdrz->vre_txgs); re;
re = AVL_NEXT(&vdrz->vre_txgs, re)) {
txgs[i++] = re->re_txg;
}

fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
txgs, count);

kmem_free(txgs, sizeof (uint64_t) * count);
}
}

/*
Expand All @@ -3412,7 +3498,8 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
void *
vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
{
uint64_t nparity, lw;
uint64_t nparity, *txgs;
uint_t txgs_size;
vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);

vdrz->vn_vre.vre_vdev_id = -1;
Expand All @@ -3432,11 +3519,6 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
vdrz->vd_logical_width = children;
vdrz->vd_physical_width = children;

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
&lw) == 0) {
vdrz->vd_logical_width = lw;
}

/* note, the ID does not exist when creating a pool */
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&vdrz->vn_vre.vre_vdev_id);
Expand All @@ -3450,6 +3532,23 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
*/
}

avl_create(&vdrz->vre_txgs, vedv_raidz_reflow_compare,
sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
&txgs, &txgs_size);
if (error == 0) {
for (int i = 0; i < txgs_size; i++) {
reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
re->re_txg = txgs[txgs_size - i - 1];
re->re_logical_width = vdrz->vd_physical_width - i;
avl_add(&vdrz->vre_txgs, re);
}

reflow_node_t *re = avl_first(&vdrz->vre_txgs);
vdrz->vd_logical_width = re->re_logical_width;
}

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
Expand Down Expand Up @@ -3479,7 +3578,7 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
vdrz->vd_nparity = nparity;
return (vdrz);
out:
kmem_free(vdrz, sizeof (*vdrz));
vdev_raidz_free(vdrz);
return (NULL);
}

Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -708,8 +708,9 @@ tags = ['functional', 'redacted_send']

[tests/functional/raidz]
tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos',
'raidz_expand.ksh']
'raidz_expand_001_pos', 'raidz_expand_002_pos']
tags = ['functional', 'raidz']
timeout = 1200

[tests/functional/redundancy]
tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos',
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/raidz/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ dist_pkgdata_SCRIPTS = \
raidz_002_pos.ksh \
raidz_003_pos.ksh \
raidz_004_pos.ksh \
raidz_expand.ksh
raidz_expand_001_pos.ksh \
raidz_expand_002_pos.ksh
Loading