diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 9c570aca176d..3e7c6ac701e8 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -686,8 +686,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" -#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width" #define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 146dd3c29660..81ac4c7c7c93 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -35,6 +35,7 @@ extern "C" { struct zio; struct raidz_row; struct raidz_map; +struct vdev_raidz; #if !defined(_KERNEL) struct kernel_param {}; #endif @@ -47,6 +48,7 @@ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); @@ -95,6 +97,12 @@ typedef struct vdev_raidz { int vd_logical_width; int vd_physical_width; int vd_nparity; + + /* + * Last reflow txg per attached device. 
+ */ + avl_tree_t vre_txgs; + /* * If this vdev is being expanded, spa_raidz_expand is set to this */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8edd786331ff..dbcf652c1186 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -909,10 +910,8 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); - if (vd->vdev_ops == &vdev_raidz_ops) { - vdev_raidz_t *rz = vd->vdev_tsd; - kmem_free(rz, sizeof (*rz)); - } + if (vd->vdev_ops == &vdev_raidz_ops) + vdev_raidz_free(vd->vdev_tsd); /* * Discard allocation state. diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index e1b1950f04ec..43556530c189 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -196,6 +196,41 @@ vdev_raidz_map_free_vsd(zio_t *zio) } } +/* + * The AVL tree below is the core of the expanded raidz time-dependent + * geometry logic, which allows BPs stored in reflowed and normal format + * to be mixed depending on the BP's birth txg. A new reflow node is added + * when the first expansion starts and each time a reflow completes. + * re_txg is the txg of the last completed reflow and re_logical_width is + * the logical width required to read a BP written in that format. 
+ */ +typedef struct reflow_node { + uint64_t re_txg; + uint64_t re_logical_width; + avl_node_t re_link; +} reflow_node_t; + +static int +vedv_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = (reflow_node_t *)x1; + const reflow_node_t *r = (reflow_node_t *)x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + +void +vdev_raidz_free(vdev_raidz_t *vdrz) +{ + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vre_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vre_txgs); + kmem_free(vdrz, sizeof (*vdrz)); +} + /*ARGSUSED*/ static void vdev_raidz_cksum_free(void *arg, size_t ignored) @@ -2010,6 +2045,20 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) } } +static uint64_t +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, zio_t *zio) +{ + reflow_node_t *re, lookup = { BP_PHYSICAL_BIRTH(zio->io_bp), 0 }; + avl_index_t where; + + re = avl_find(&vdrz->vre_txgs, &lookup, &where); + if (re != NULL) + return (re->re_logical_width); + + re = avl_nearest(&vdrz->vre_txgs, where, AVL_BEFORE); + return (re->re_logical_width); +} + /* * Start an IO operation on a RAIDZ VDev * @@ -2036,22 +2085,30 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; if (vdrz->vd_logical_width != vdrz->vd_physical_width) { - /* XXX rangelock not needed after expansion completes */ - zfs_locked_range_t *lr = - zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, - zio->io_offset, zio->io_size, RL_READER); - - rm = vdev_raidz_map_alloc_expanded(zio->io_abd, - zio->io_size, zio->io_offset, - tvd->vdev_ashift, vdrz->vd_physical_width, - vdrz->vd_logical_width, vdrz->vd_nparity, - vdrz->vn_vre.vre_offset_phys); - rm->rm_lr = lr; - /* - * XXX If this is a write, will need to do additional - * writes to locations that are already copied, but - * not yet reflected in the on-disk format. 
- */ + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, zio); + if (vdrz->vn_vre.vre_offset != UINT64_MAX || + logical_width != vdrz->vd_physical_width) { + /* XXX rangelock not needed after expansion completes */ + zfs_locked_range_t *lr = + zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + + rm = vdev_raidz_map_alloc_expanded(zio->io_abd, + zio->io_size, zio->io_offset, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + vdrz->vn_vre.vre_offset_phys); + rm->rm_lr = lr; + /* + * XXX If this is a write, will need to do additional + * writes to locations that are already copied, but + * not yet reflected in the on-disk format. + */ + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + vdrz->vd_nparity); + } } else { rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vdrz->vd_logical_width, @@ -2977,12 +3034,19 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; for (int i = 0; i < TXG_SIZE; i++) ASSERT0(vre->vre_offset_pertxg[i]); vre->vre_offset_phys = UINT64_MAX; + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg; + re->re_logical_width = vdrz->vd_physical_width; + avl_add(&vdrz->vre_txgs, re); + /* * vre_offset_phys will be removed from the on-disk config by * vdev_raidz_config_generate(). 
@@ -3337,6 +3401,13 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, new_child); + if (vdrz->vd_logical_width == vdrz->vd_physical_width) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = 0; + re->re_logical_width = vdrz->vd_logical_width; + avl_add(&vdrz->vre_txgs, re); + } + vdrz->vd_physical_width++; vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; @@ -3396,12 +3467,27 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH, - vdrz->vd_logical_width); if (vdrz->vn_vre.vre_offset_phys != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET, vdrz->vn_vre.vre_offset_phys); } + + if (!avl_is_empty(&vdrz->vre_txgs)) { + uint64_t i = 0, count = avl_numnodes(&vdrz->vre_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + + for (reflow_node_t *re = + avl_first(&vdrz->vre_txgs); re; + re = AVL_NEXT(&vdrz->vre_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } } /* @@ -3412,7 +3498,8 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) void * vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) { - uint64_t nparity, lw; + uint64_t nparity, *txgs; + uint_t txgs_size; vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); vdrz->vn_vre.vre_vdev_id = -1; @@ -3432,11 +3519,6 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) vdrz->vd_logical_width = children; vdrz->vd_physical_width = children; - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH, - &lw) == 0) { - vdrz->vd_logical_width = lw; - } - /* note, the ID does not exist when creating a pool */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &vdrz->vn_vre.vre_vdev_id); @@ -3450,6 +3532,23 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) 
*/ } + avl_create(&vdrz->vre_txgs, vedv_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; + avl_add(&vdrz->vre_txgs, re); + } + + reflow_node_t *re = avl_first(&vdrz->vre_txgs); + vdrz->vd_logical_width = re->re_logical_width; + } + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) @@ -3479,7 +3578,7 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) vdrz->vd_nparity = nparity; return (vdrz); out: - kmem_free(vdrz, sizeof (*vdrz)); + vdev_raidz_free(vdrz); return (NULL); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b2df6d70444e..a54e5772fa00 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -708,8 +708,9 @@ tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos', - 'raidz_expand.ksh'] + 'raidz_expand_001_pos', 'raidz_expand_002_pos'] tags = ['functional', 'raidz'] +timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', diff --git a/tests/zfs-tests/tests/functional/raidz/Makefile.am b/tests/zfs-tests/tests/functional/raidz/Makefile.am index 1068a35eb3d4..bf50764db1ca 100644 --- a/tests/zfs-tests/tests/functional/raidz/Makefile.am +++ b/tests/zfs-tests/tests/functional/raidz/Makefile.am @@ -6,4 +6,5 @@ dist_pkgdata_SCRIPTS = \ raidz_002_pos.ksh \ raidz_003_pos.ksh \ raidz_004_pos.ksh \ - raidz_expand.ksh + raidz_expand_001_pos.ksh \ + raidz_expand_002_pos.ksh diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand.ksh 
b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/raidz/raidz_expand.ksh rename to tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh new file mode 100755 index 000000000000..d760cb3c0dd5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -0,0 +1,123 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach a new device to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2. 
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - for each free test block device +# - attach to the pool +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +function wait_expand_completion +{ + while zpool status $TESTPOOL | grep 'raidz expand:' | \ + grep 'in progress'; do + sleep 1 + done +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + log_must zfs set primarycache=metadata $pool + + log_must zfs create $pool/fs + log_must fill_fs /$pool/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $pool/fs2 + log_must fill_fs /$pool/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 + log_must fill_fs /$pool/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $pool) + + for disk in ${disks[$(($nparity+1))+1..$devs]}; do + log_must zpool attach $pool ${raid}-0 $disk + + wait_expand_completion + + log_must zpool export $pool + log_must zpool import $opts -d $dir $pool + + typeset disk_attached=$(get_disklist $pool | grep $disk) + if [[ -z $disk_attached ]]; then + log_fail "pool $pool attached disk not found" + fi + + typeset expand_size=$(get_pool_prop size $pool) + if [[ "$expand_size" -le "$pool_size" ]]; then + log_fail 
"pool $pool not expanded" + fi + + pool_size=$expand_size + done + + zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." \ No newline at end of file