Commit 1085623

Improve allocation fallback handling
Before this change, ZFS always fell back to the normal class on any allocation error. But with more classes available we might want more sophisticated logic. For example, it makes sense to fall back from the dedup class first to the special class (if the DDT is allowed there) and only then to normal, since in a pool with both dedup and special classes populated, the normal class likely has performance characteristics unsuitable for dedup. This change implements a general mechanism where the fallback order is controlled by the same spa_preferred_class() used for the initial class selection. As a first application, it implements the mentioned dedup -> special -> normal fallbacks. I have more plans for it later.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Paul Dagnelie <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #17391
1 parent e0edfcb commit 1085623
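
To visualize the mechanism before reading the diffs, here is a minimal, hypothetical sketch of the chain described above. The spa_*() helpers are real (two are added by this commit), but next_class_for_ddt() is invented purely for illustration; the actual logic lives inside spa_preferred_class() and keys off zio->io_metaslab_class.

/*
 * Illustrative only, not the patch itself: pick the next class for a
 * DDT block given the class that already failed (NULL = first attempt).
 */
static metaslab_class_t *
next_class_for_ddt(spa_t *spa, metaslab_class_t *failed)
{
	if (failed == NULL && spa_has_dedup(spa))
		return (spa_dedup_class(spa));		/* initial choice */
	if (failed == spa_dedup_class(spa) && spa_special_has_ddt(spa))
		return (spa_special_class(spa));	/* dedup -> special */
	return (spa_normal_class(spa));			/* last resort */
}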

File tree: 4 files changed, +75 -62 lines

include/sys/spa.h

Lines changed: 2 additions & 0 deletions
@@ -1116,7 +1116,9 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
+extern boolean_t spa_has_dedup(spa_t *spa);
 extern boolean_t spa_has_slogs(spa_t *spa);
+extern boolean_t spa_has_special(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);

module/zfs/ddt.c

Lines changed: 24 additions & 27 deletions
@@ -1037,29 +1037,18 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 	ddt_free(ddt, dde);
 }
 
+/*
+ * We're considered over quota when we hit 85% full, or for larger drives,
+ * when there is less than 8GB free.
+ */
 static boolean_t
-ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
-{
-	if (mc != NULL && metaslab_class_get_space(mc) > 0) {
-		/* Over quota if allocating outside of this special class */
-		if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
-		    dedup_class_wait_txgs) {
-			/* Waiting for some deferred frees to be processed */
-			return (B_TRUE);
-		}
-
-		/*
-		 * We're considered over quota when we hit 85% full, or for
-		 * larger drives, when there is less than 8GB free.
-		 */
-		uint64_t allocated = metaslab_class_get_alloc(mc);
-		uint64_t capacity = metaslab_class_get_space(mc);
-		uint64_t limit = MAX(capacity * 85 / 100,
-		    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
-
-		return (allocated >= limit);
-	}
-	return (B_FALSE);
+ddt_special_over_quota(metaslab_class_t *mc)
+{
+	uint64_t allocated = metaslab_class_get_alloc(mc);
+	uint64_t capacity = metaslab_class_get_space(mc);
+	uint64_t limit = MAX(capacity * 85 / 100,
+	    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
+	return (allocated >= limit);
 }
 
 /*
@@ -1082,13 +1071,21 @@ ddt_over_quota(spa_t *spa)
 		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
 
 	/*
-	 * For automatic quota, table size is limited by dedup or special class
+	 * Over quota if have to allocate outside of the dedup/special class.
 	 */
-	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
-		return (B_TRUE);
-	else if (spa_special_has_ddt(spa) &&
-	    ddt_special_over_quota(spa, spa_special_class(spa)))
+	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
+	    dedup_class_wait_txgs) {
+		/* Waiting for some deferred frees to be processed */
 		return (B_TRUE);
+	}
+
+	/*
+	 * For automatic quota, table size is limited by dedup or special class
+	 */
+	if (spa_has_dedup(spa))
+		return (ddt_special_over_quota(spa_dedup_class(spa)));
+	else if (spa_special_has_ddt(spa))
+		return (ddt_special_over_quota(spa_special_class(spa)));
 
 	return (B_FALSE);
 }
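
To make the refactored threshold concrete, here is a small standalone program evaluating the same MAX() expression as ddt_special_over_quota() above; the 100 GiB capacity is a hypothetical sample value.

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t capacity = 100ULL << 30;	/* hypothetical 100 GiB class */
	uint64_t limit = MAX(capacity * 85 / 100,
	    (capacity > (1ULL << 33)) ? capacity - (1ULL << 33) : 0);

	/* 85% full = 85 GiB; under 8 GiB free = 92 GiB; larger bound wins */
	printf("over quota at %llu GiB allocated\n",
	    (unsigned long long)(limit >> 30));
	return (0);
}

For any class larger than about 53 GiB the free-space bound exceeds the 85% bound, so big special vdevs hit the quota on absolute free space rather than percentage.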

module/zfs/spa_misc.c

Lines changed: 23 additions & 9 deletions
@@ -2009,8 +2009,7 @@ spa_dedup_class(spa_t *spa)
 boolean_t
 spa_special_has_ddt(spa_t *spa)
 {
-	return (zfs_ddt_data_is_special &&
-	    spa->spa_special_class->mc_groups != 0);
+	return (zfs_ddt_data_is_special && spa_has_special(spa));
 }
 
 /*
@@ -2019,6 +2018,9 @@ spa_special_has_ddt(spa_t *spa)
 metaslab_class_t *
 spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
+	metaslab_class_t *mc = zio->io_metaslab_class;
+	boolean_t tried_dedup = (mc == spa_dedup_class(spa));
+	boolean_t tried_special = (mc == spa_special_class(spa));
 	const zio_prop_t *zp = &zio->io_prop;
 
 	/*
@@ -2036,12 +2038,10 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 */
 	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
-	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
-
 	if (DMU_OT_IS_DDT(objtype)) {
-		if (spa->spa_dedup_class->mc_groups != 0)
+		if (spa_has_dedup(spa) && !tried_dedup && !tried_special)
 			return (spa_dedup_class(spa));
-		else if (has_special_class && zfs_ddt_data_is_special)
+		else if (spa_special_has_ddt(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2050,14 +2050,15 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	/* Indirect blocks for user data can land in special if allowed */
 	if (zp->zp_level > 0 &&
 	    (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
-		if (has_special_class && zfs_user_indirect_is_special)
+		if (zfs_user_indirect_is_special && spa_has_special(spa) &&
+		    !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
 	}
 
 	if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) {
-		if (has_special_class)
+		if (spa_has_special(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2069,7 +2070,8 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	if ((DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL) &&
-	    has_special_class && zio->io_size <= zp->zp_zpl_smallblk) {
+	    spa_has_special(spa) && !tried_special &&
+	    zio->io_size <= zp->zp_zpl_smallblk) {
 		metaslab_class_t *special = spa_special_class(spa);
 		uint64_t alloc = metaslab_class_get_alloc(special);
 		uint64_t space = metaslab_class_get_space(special);
@@ -2640,6 +2642,12 @@ spa_fini(void)
 	mutex_destroy(&spa_l2cache_lock);
 }
 
+boolean_t
+spa_has_dedup(spa_t *spa)
+{
+	return (spa->spa_dedup_class->mc_groups != 0);
+}
+
 /*
  * Return whether this pool has a dedicated slog device. No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
@@ -2651,6 +2659,12 @@ spa_has_slogs(spa_t *spa)
 	return (spa->spa_log_class->mc_groups != 0);
 }
 
+boolean_t
+spa_has_special(spa_t *spa)
+{
+	return (spa->spa_special_class->mc_groups != 0);
+}
+
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {
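
A note on the small-block branch above: the context lines fetch alloc and space for the special class, which the surrounding (unchanged) code compares against a limit that keeps zfs_special_class_metadata_reserve_pct percent in reserve for metadata. A rough worked example, with hypothetical numbers and the formula paraphrased from that surrounding code:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t space = 100ULL << 30;	/* hypothetical 100 GiB special class */
	uint64_t reserve_pct = 25;	/* assumed default metadata reserve */
	uint64_t limit = space * (100 - reserve_pct) / 100;

	/* Small file blocks keep landing on special only below this usage */
	printf("smallblk cutoff near %llu GiB allocated\n",
	    (unsigned long long)(limit >> 30));
	return (0);
}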

module/zfs/zio.c

Lines changed: 26 additions & 26 deletions
@@ -4150,7 +4150,7 @@ static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	metaslab_class_t *mc;
+	metaslab_class_t *mc, *newmc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
@@ -4193,7 +4193,7 @@ zio_dva_allocate(zio_t *zio)
 again:
 	/*
 	 * Try allocating the block in the usual metaslab class.
-	 * If that's full, allocate it in the normal class.
+	 * If that's full, allocate it in some other class(es).
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
@@ -4208,29 +4208,29 @@
 	    &zio->io_alloc_list, zio->io_allocator, zio);
 
 	/*
-	 * Fallback to normal class when an alloc class is full
+	 * When the dedup or special class is spilling into the normal class,
+	 * there can still be significant space available due to deferred
+	 * frees that are in-flight. We track the txg when this occurred and
+	 * back off adding new DDT entries for a few txgs to allow the free
+	 * blocks to be processed.
 	 */
-	if (error == ENOSPC && mc != spa_normal_class(spa)) {
-		/*
-		 * When the dedup or special class is spilling into the normal
-		 * class, there can still be significant space available due
-		 * to deferred frees that are in-flight. We track the txg when
-		 * this occurred and back off adding new DDT entries for a few
-		 * txgs to allow the free blocks to be processed.
-		 */
-		if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
-		    mc == spa_special_class(spa))) &&
-		    spa->spa_dedup_class_full_txg != zio->io_txg) {
-			spa->spa_dedup_class_full_txg = zio->io_txg;
-			zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
-			    "%llu allocated of %llu",
-			    spa_name(spa), (int)zio->io_txg,
-			    mc == spa_dedup_class(spa) ? "dedup" : "special",
-			    (int)zio->io_size,
-			    (u_longlong_t)metaslab_class_get_alloc(mc),
-			    (u_longlong_t)metaslab_class_get_space(mc));
-		}
+	if (error == ENOSPC && spa->spa_dedup_class_full_txg != zio->io_txg &&
+	    (mc == spa_dedup_class(spa) || (mc == spa_special_class(spa) &&
+	    !spa_has_dedup(spa) && spa_special_has_ddt(spa)))) {
+		spa->spa_dedup_class_full_txg = zio->io_txg;
+		zfs_dbgmsg("%s[%llu]: %s class spilling, req size %llu, "
+		    "%llu allocated of %llu",
+		    spa_name(spa), (u_longlong_t)zio->io_txg,
+		    mc == spa_dedup_class(spa) ? "dedup" : "special",
+		    (u_longlong_t)zio->io_size,
+		    (u_longlong_t)metaslab_class_get_alloc(mc),
+		    (u_longlong_t)metaslab_class_get_space(mc));
+	}
 
+	/*
+	 * Fall back to some other class when this one is full.
+	 */
+	if (error == ENOSPC && (newmc = spa_preferred_class(spa, zio)) != mc) {
 		/*
 		 * If we are holding old class reservation, drop it.
 		 * Dispatch the next ZIO(s) there if some are waiting.
@@ -4246,15 +4246,15 @@
 
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
-			    "trying normal class: zio %px, size %llu, error %d",
+			    "trying fallback: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
-		zio->io_metaslab_class = mc = spa_normal_class(spa);
+		zio->io_metaslab_class = mc = newmc;
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 
 		/*
-		 * If normal class uses throttling, return to that pipeline
+		 * If the new class uses throttling, return to that pipeline
 		 * stage. Otherwise just do another allocation attempt.
 		 */
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
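
Putting the zio.c changes together, the retry shape after this commit looks roughly like the runnable sketch below. The enum, try_alloc(), and preferred_class() are hypothetical stand-ins for the real metaslab classes, metaslab_alloc(), and spa_preferred_class(); throttling and gang allocation are omitted.

#include <errno.h>
#include <stdio.h>

enum mclass { MC_DEDUP, MC_SPECIAL, MC_NORMAL };

/* Pretend dedup and special are full and only normal has space */
static int
try_alloc(enum mclass mc)
{
	return (mc == MC_NORMAL ? 0 : ENOSPC);
}

/* The dedup -> special -> normal chain this commit introduces */
static enum mclass
preferred_class(enum mclass tried)
{
	if (tried == MC_DEDUP)
		return (MC_SPECIAL);
	return (MC_NORMAL);
}

int
main(void)
{
	enum mclass mc = MC_DEDUP;
	int error;

	while ((error = try_alloc(mc)) == ENOSPC) {
		enum mclass newmc = preferred_class(mc);
		if (newmc == mc)
			break;	/* chain exhausted: gang block or fail */
		mc = newmc;	/* fall back and retry, like the again: path */
	}
	printf("allocated in class %d, error %d\n", mc, error);
	return (0);
}

When spa_preferred_class() returns the class that just failed, the chain is exhausted and the real code proceeds to gang-block allocation instead of looping forever.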
