From 2debf5f10e58f5e0acc6dfcd382c34b03ab16d24 Mon Sep 17 00:00:00 2001 From: Mark Maybee Date: Mon, 26 Apr 2021 12:27:47 -0600 Subject: [PATCH] VDEV_PROP_NOALLOC plumbing Signed-off-by: Allan Jude --- cmd/zpool/zpool_main.c | 20 ++- include/sys/fs/zfs.h | 2 + include/sys/spa.h | 3 +- include/sys/vdev_impl.h | 1 + module/zfs/vdev.c | 64 ++++++---- module/zfs/vdev_label.c | 4 + module/zfs/vdev_removal.c | 252 ++++++++++++++++++++++++++++++++------ module/zfs/zio.c | 2 +- 8 files changed, 281 insertions(+), 67 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 2dd654e8bc37..71252bee2667 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2431,6 +2431,12 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } + if (vs->vs_scan_removing != 0) { + (void) printf(gettext(" (removing)")); + } else if (vs->vs_noalloc != 0) { + (void) printf(gettext(" (non-allocating)")); + } + /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); @@ -10202,6 +10208,14 @@ set_callback(zpool_handle_t *zhp, void *data) int error; set_cbdata_t *cb = (set_cbdata_t *)data; + if (cb->cb_type == ZFS_TYPE_VDEV) { + error = zpool_set_vdev_prop(zhp, *cb->cb_vdevs.cb_names, + cb->cb_propname, cb->cb_value); + if (!error) + cb->cb_any_successful = B_TRUE; + return (error); + } + /* Check if we have out-of-bounds features */ if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) { boolean_t features[SPA_FEATURES]; @@ -10259,11 +10273,7 @@ set_callback(zpool_handle_t *zhp, void *data) } } - if (cb->cb_type == ZFS_TYPE_VDEV) - error = zpool_set_vdev_prop(zhp, *cb->cb_vdevs.cb_names, - cb->cb_propname, cb->cb_value); - else - error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); + error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; 
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 85561ea5fb51..8d8229f8d16f 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -774,6 +774,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" +#define ZPOOL_CONFIG_NONALLOCATING "non_allocating" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" @@ -1151,6 +1152,7 @@ typedef struct vdev_stat { uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ + uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 374d36e7327e..df528cf67281 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -792,7 +792,8 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); -extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern int spa_vdev_alloc(spa_t *spa, uint64_t guid); +extern int spa_vdev_noalloc(spa_t *spa, uint64_t guid); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3cfde40a77fe..86959725a513 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -295,6 +295,7 @@ struct vdev { list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ + uint64_t 
vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ae0a5fc5fae1..18a1d6017db6 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -867,6 +867,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, + &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, @@ -1185,8 +1187,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); + tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; @@ -1202,6 +1206,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; + svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; @@ -1500,11 +1505,11 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* - * If the vdev is being removed we don't activate - * the metaslabs since we want to ensure that no new - * allocations are performed on this device. + * If the vdev is marked as non-allocating then don't + * activate the metaslabs since we want to ensure that + * no allocations are performed on this device. 
*/ - if (!expanding && !vd->vdev_removing) { + if (!expanding && !vd->vdev_noalloc) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); @@ -4440,6 +4445,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } + vs->vs_noalloc = vd->vdev_noalloc; } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -5516,6 +5522,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; + int error = 0; ASSERT(vd != NULL); @@ -5528,7 +5535,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) #if 0 if ((error = vdev_prop_validate(spa, nvprops)) != 0) - return; + return (error); #endif while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { @@ -5538,15 +5545,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) char *strval = NULL; if (prop == VDEV_PROP_INVAL && !vdev_prop_user(propname)) { - intval = EINVAL; - vdev_prop_add_list(outnvl, propname, strval, intval, 0); - continue; + error = EINVAL; + goto end; } - if (vdev_prop_readonly(prop) == B_TRUE) { - intval = EROFS; - vdev_prop_add_list(outnvl, propname, strval, intval, 0); - continue; + if (vdev_prop_readonly(prop)) { + error = EROFS; + goto end; } /* Special Processing */ @@ -5554,28 +5559,41 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_PATH: strval = vd->vdev_path; if (strval == NULL) - intval = EROFS; - if (nvpair_type(elem) != DATA_TYPE_STRING) - intval = EINVAL; - if (intval == 0) - strval = fnvpair_value_string(elem); + error = EROFS; + else if (nvpair_type(elem) != DATA_TYPE_STRING) + error = EINVAL; + if (error != 0) + break; + strval = fnvpair_value_string(elem); if (strval == NULL) - intval = EINVAL; - if (intval != 0) { - vdev_prop_add_list(outnvl, propname, strval, - intval, 0); - continue; - } + error = EINVAL; + if (error != 0) + break; 
spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(strval); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(vd->vdev_top); spa_config_exit(spa, SCL_CONFIG, FTAG); break; + case VDEV_PROP_NOALLOC: + intval = fnvpair_value_uint64(elem); + if (intval == vd->vdev_noalloc) + return (0); /* noop */ + if (intval == 1) + error = spa_vdev_noalloc(spa, vdev_guid); + else + error = spa_vdev_alloc(spa, vdev_guid); + break; default: /* Most processing is done in vdev_sync_props */ break; } +end: + if (error != 0) { + intval = error; + vdev_prop_add_list(outnvl, propname, strval, intval, 0); + return (error); + } } return (dsl_sync_task(spa->spa_name, NULL, vdev_sync_props, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 04202a9f8960..5a6ece817b47 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -478,6 +478,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); + if (vd->vdev_noalloc) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, + vd->vdev_noalloc); + } if (vd->vdev_removing) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index d7c0641c8c2c..9ed1e6763a44 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -167,6 +167,16 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) return (NULL); } +static void +vdev_activate(vdev_t *vd) +{ + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); + vd->vdev_noalloc = B_FALSE; +} + static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) @@ -1616,6 +1626,44 @@ spa_vdev_remove_suspend(spa_t *spa) mutex_exit(&svr->svr_lock); } +static boolean_t 
+vdev_prop_noalloc(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + uint64_t objid; + uint64_t noalloc = 0; + int err = 0; + + ASSERT(vd != NULL); + + if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + objid = 0; + } + + /* no vdev property object => no props */ + if (mos == NULL || objid == 0) { + return (B_FALSE); + } + + mutex_enter(&spa->spa_props_lock); + + err = zap_lookup(mos, objid, vdev_prop_to_name(VDEV_PROP_NOALLOC), + sizeof (uint64_t), 1, &noalloc); + + mutex_exit(&spa->spa_props_lock); + + if (err != 0) { + return (B_FALSE); + } + + return (noalloc > 0); +} + /* ARGSUSED */ static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) @@ -1758,6 +1806,14 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; + + if (!vdev_prop_noalloc(vd)) { + /* XXX - not sure locking is correct/necessary... 
*/ + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_activate(vd); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", @@ -1771,21 +1827,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) static int spa_vdev_remove_cancel_impl(spa_t *spa) { - uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; - int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); - - if (error == 0) { - spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); - vdev_t *vd = vdev_lookup_top(spa, vdid); - metaslab_group_activate(vd->vdev_mg); - ASSERT(!vd->vdev_islog); - metaslab_group_activate(vd->vdev_log_mg); - spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); - } - return (error); } @@ -2093,6 +2137,110 @@ spa_vdev_remove_top_check(vdev_t *vd) return (0); } +int +spa_vdev_alloc(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + uint64_t txg; + int error = 0; + + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_writeable(spa)); + + /* XXX - is this necessary for activate? 
*/ + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + error = SET_ERROR(ENOENT); + else if (vd->vdev_mg == NULL) + error = SET_ERROR(ENOTSUP); + else + vdev_activate(vd); + + if (error == 0) { + vdev_dirty_leaves(vd, VDD_DTL, txg); + vdev_config_dirty(vd); + } + + (void) spa_vdev_exit(spa, NULL, txg, error); + + return (error); +} + +static int +vdev_passivate(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *normal = spa_normal_class(spa); + if (mg->mg_class == normal) { + /* + * We must check that this is not the only allocating device in + * the pool before passivating, otherwise we will not be able + * to make progress because we can't allocate from any vdevs. + */ + boolean_t last = B_TRUE; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + + if (cvd == vd || + cvd->vdev_ops == &vdev_indirect_ops) + continue; + + metaslab_class_t *mc = cvd->vdev_mg->mg_class; + if (mc != normal) + continue; + + if (!cvd->vdev_noalloc) { + last = B_FALSE; + break; + } + } + if (last) + return (SET_ERROR(EINVAL)); + } + + metaslab_group_passivate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. 
+ */ + error = spa_reset_logs(spa); + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + if (vd->vdev_log_mg != NULL) + metaslab_group_activate(vd->vdev_log_mg); + return (error); + } + + vd->vdev_noalloc = B_TRUE; + + return (0); +} + /* * Initiate removal of a top-level vdev, reducing the total space in the pool. * The config lock is held for the specified TXG. Once initiated, @@ -2105,6 +2253,7 @@ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; + boolean_t set_noalloc = B_FALSE; int error; /* @@ -2113,8 +2262,6 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * are errors. */ error = spa_vdev_remove_top_check(vd); - if (error != 0) - return (error); /* * Stop allocating from this vdev. Note that we must check @@ -2124,31 +2271,22 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * The above check for sufficient free space serves this * purpose. */ - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - ASSERT(!vd->vdev_islog); - metaslab_group_passivate(vd->vdev_log_mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + if (error == 0 && !vd->vdev_noalloc) { + set_noalloc = B_TRUE; + error = vdev_passivate(vd, txg); + } - /* - * We must ensure that no "stubby" log blocks are allocated - * on the device to be removed. These blocks could be - * written at any time, including while we are in the middle - * of copying them. - */ - error = spa_reset_logs(spa); + if (error != 0) + return (error); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. 
*/ + + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); @@ -2159,13 +2297,11 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. */ - if (error == 0) - error = spa_vdev_remove_top_check(vd); + error = spa_vdev_remove_top_check(vd); if (error != 0) { - metaslab_group_activate(mg); - ASSERT(!vd->vdev_islog); - metaslab_group_activate(vd->vdev_log_mg); + if (set_noalloc) + vdev_activate(vd); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); @@ -2184,6 +2320,48 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) return (0); } +/* + * Turn off allocations for a top-level device from the pool. + * + * Turning off allocations for a top-level device can take a significant + * amount of time. As a result we use the spa_vdev_config_[enter/exit] + * functions which allow us to grab and release the spa_config_lock while + * still holding the namespace lock. During each step the configuration + * is synced out. + */ +int +spa_vdev_noalloc(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + uint64_t txg; + int error; + + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + error = SET_ERROR(ENOENT); + else if (vd->vdev_mg == NULL) + error = SET_ERROR(ENOTSUP); + else + error = vdev_passivate(vd, &txg); + + if (error == 0) { + vdev_dirty_leaves(vd, VDD_DTL, txg); + vdev_config_dirty(vd); + } + + error = spa_vdev_exit(spa, NULL, txg, error); + + return (error); +} + /* * Remove a device from the pool. 
* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 87ccb6861850..1d5d5e2093be 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3736,7 +3736,7 @@ zio_vdev_io_start(zio_t *zio) * Note: the code can handle other kinds of writes, * but we don't expect them. */ - if (zio->io_vd->vdev_removing) { + if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));