Skip to content

Commit c2d1129

Browse files
authored
DAOS-15993 rebuild: for manual rebuilds do not eval self_heal (#17345)
Consider a quick maintenance scenario in which a daos_engine is stopped briefly, and the administrator does not want the DAOS automatic recovery / rebuild mechanism to run. That is, a pool map update (targets from UP_IN to DOWN) is to occur, the pool is to enter a degraded mode (still allowing ongoing I/O), and NO rebuild is to be triggered during this brief time window. The above can be arranged by modifying the system or pool-specific self_heal property value (to not set the rebuild bit), and then stopping the engine. Now also consider the conclusion of the maintenance, which involves restarting the engine and reintegrating that rank back into the pool. It is most convenient to directly issue a dmg pool reintegrate command from the maintenance state. Before this change, manual administration commands such as dmg pool exclude/reintegrate were prevented from triggering rebuilds due to the pool self_heal property setting. However, the intention of the self_heal (aka auto recovery) feature is to apply only to automatic rebuilds. With this change, the is_pool_rebuild_allowed() function is updated to accept an indication of whether the self_heal checks are applicable. Manual pool map update and rebuild cases supply false for this argument (allowing those cases to result in a rebuild being scheduled). Signed-off-by: Kenneth Cain <kenneth.cain@hpe.com>
1 parent 22397f3 commit c2d1129

File tree

6 files changed

+59
-33
lines changed

6 files changed

+59
-33
lines changed

src/include/daos_srv/pool.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -567,18 +567,19 @@ int
567567
ds_pool_prop_recov_cont_reset(struct rdb_tx *tx, struct ds_rsvc *rsvc);
568568

569569
static inline bool
570-
is_pool_rebuild_allowed(struct ds_pool *pool, bool check_delayed_rebuild)
570+
is_pool_rebuild_allowed(struct ds_pool *pool, uint64_t self_heal, bool auto_recovery)
571571
{
572-
uint64_t flags = DAOS_SELF_HEAL_AUTO_REBUILD;
573-
574-
if (check_delayed_rebuild)
575-
flags |= DAOS_SELF_HEAL_DELAY_REBUILD;
572+
bool auto_rebuild_enabled = self_heal & DAOS_SELF_HEAL_AUTO_REBUILD;
573+
bool delay_rebuild_enabled = self_heal & DAOS_SELF_HEAL_DELAY_REBUILD;
576574

577575
if (pool->sp_disable_rebuild)
578576
return false;
579-
if (!(pool->sp_self_heal & flags))
577+
578+
/* If auto recovery is requested, only allow if self_heal enables auto or delay_rebuild */
579+
if (auto_recovery && !(auto_rebuild_enabled || delay_rebuild_enabled))
580580
return false;
581581

582+
/* Otherwise, rebuild is allowed */
582583
return true;
583584
}
584585

src/include/daos_srv/rebuild.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* (C) Copyright 2017-2023 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
*
55
* SPDX-License-Identifier: BSD-2-Clause-Patent
66
*/
@@ -96,7 +96,7 @@ void ds_rebuild_running_query(uuid_t pool_uuid, uint32_t opc, uint32_t *rebuild_
9696
daos_epoch_t *current_eph, uint32_t *rebuild_gen);
9797
int
9898
ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys_self_heal,
99-
uint64_t delay_sec);
99+
bool auto_recovery, uint64_t delay_sec);
100100
void ds_rebuild_leader_stop_all(void);
101101
void ds_rebuild_abort(uuid_t pool_uuid, unsigned int version, uint32_t rebuild_gen,
102102
uint64_t term);

src/pool/srv_pool.c

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* (C) Copyright 2016-2024 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
* (C) Copyright 2025 Google LLC
55
*
66
* SPDX-License-Identifier: BSD-2-Clause-Patent
@@ -2577,7 +2577,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
25772577
if (rc != 0)
25782578
goto out;
25792579

2580-
rc = ds_rebuild_regenerate_task(svc->ps_pool, prop, sys_self_heal, 0);
2580+
rc = ds_rebuild_regenerate_task(svc->ps_pool, prop, sys_self_heal, true /* auto_recovery */,
2581+
0 /* delay_sec */);
25812582
if (rc != 0)
25822583
goto out;
25832584

@@ -7746,23 +7747,24 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank,
77467747
struct pool_target_addr_list *inval_list_out, uint32_t *map_version,
77477748
struct rsvc_hint *hint, enum map_update_source src, uint32_t flags)
77487749
{
7749-
struct pool_target_id_list target_list = {0};
7750-
uint32_t tgt_map_ver = 0;
7751-
bool updated;
7752-
int rc;
7753-
char *env;
7754-
daos_epoch_t rebuild_eph = d_hlc_get();
7755-
uint64_t delay = 2;
7756-
bool sys_self_heal_applicable;
7757-
uint64_t sys_self_heal = 0;
7750+
struct pool_target_id_list target_list = {0};
7751+
uint32_t tgt_map_ver = 0;
7752+
bool updated;
7753+
int rc;
7754+
char *env;
7755+
daos_epoch_t rebuild_eph = d_hlc_get();
7756+
uint64_t delay = 2;
7757+
bool auto_recovery;
7758+
uint64_t sys_self_heal = 0;
77587759

77597760
/*
7760-
* The system self-heal policy only applies to automatic pool exclude
7761+
* The pool and system self-heal policies only apply to automatic pool exclude
77617762
* and rebuild operations.
77627763
*/
7763-
sys_self_heal_applicable = (opc == MAP_EXCLUDE && src == MUS_SWIM);
7764+
auto_recovery = (opc == MAP_EXCLUDE && src == MUS_SWIM);
77647765

7765-
if (sys_self_heal_applicable) {
7766+
/* If applicable, check system self-heal policy. */
7767+
if (auto_recovery) {
77667768
rc = ds_mgmt_get_self_heal_policy(pool_svc_abort_gshp, svc, &sys_self_heal);
77677769
if (rc != 0) {
77687770
DL_ERROR(rc, DF_UUID ": failed to get self-heal policy",
@@ -7784,6 +7786,7 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank,
77847786
}
77857787
}
77867788

7789+
/* Pool self-heal policy is checked in this call. */
77877790
rc = pool_svc_update_map_internal(svc, opc, exclude_rank, extend_rank_list,
77887791
extend_domains_nr, extend_domains, &target_list, list,
77897792
hint, &updated, map_version, &tgt_map_ver, inval_list_out,
@@ -7804,14 +7807,14 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank,
78047807
}
78057808
d_freeenv_str(&env);
78067809

7807-
if (sys_self_heal_applicable && !(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) {
7810+
if (auto_recovery && !(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) {
78087811
D_DEBUG(DB_MD, DF_UUID ": pool_rebuild disabled in system property self_heal\n",
78097812
DP_UUID(svc->ps_uuid));
78107813
rc = 0;
78117814
goto out;
78127815
}
78137816

7814-
if (!is_pool_rebuild_allowed(svc->ps_pool, true)) {
7817+
if (!is_pool_rebuild_allowed(svc->ps_pool, svc->ps_pool->sp_self_heal, auto_recovery)) {
78157818
D_DEBUG(DB_MD, DF_UUID ": rebuild disabled for pool\n",
78167819
DP_UUID(svc->ps_pool->sp_uuid));
78177820
D_GOTO(out, rc);

src/rebuild/srv.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2614,19 +2614,20 @@ regenerate_task_of_type(struct ds_pool *pool, pool_comp_state_t match_states, ui
26142614
return rc;
26152615
}
26162616

2617-
2618-
/* Regenerate the rebuild tasks when changing the leader. */
2617+
/* Regenerate rebuild tasks when changing the leader, or manually starting rebuilds.
2618+
* auto_recovery (true for leader change, false for manual) applies to both sys_self_heal and prop.
2619+
*/
26192620
int
26202621
ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys_self_heal,
2621-
uint64_t delay_sec)
2622+
bool auto_recovery, uint64_t delay_sec)
26222623
{
26232624
struct daos_prop_entry *entry;
26242625
char *env;
26252626
int rc = 0;
26262627

26272628
rebuild_gst.rg_abort = 0;
26282629

2629-
if (!(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) {
2630+
if (auto_recovery && !(sys_self_heal & DS_MGMT_SELF_HEAL_POOL_REBUILD)) {
26302631
D_DEBUG(DB_REBUILD, DF_UUID ": pool_rebuild disabled in sys_self_heal\n",
26312632
DP_UUID(pool->sp_uuid));
26322633
return DER_SUCCESS;
@@ -2648,10 +2649,8 @@ ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys
26482649
}
26492650

26502651
entry = daos_prop_entry_get(prop, DAOS_PROP_PO_SELF_HEAL);
2651-
26522652
D_ASSERT(entry != NULL);
2653-
if (entry->dpe_val & (DAOS_SELF_HEAL_AUTO_REBUILD | DAOS_SELF_HEAL_DELAY_REBUILD) &&
2654-
!pool->sp_disable_rebuild) {
2653+
if (is_pool_rebuild_allowed(pool, entry->dpe_val /* self_heal */, auto_recovery)) {
26552654
rc = regenerate_task_of_type(
26562655
pool, PO_COMP_ST_DOWN,
26572656
entry->dpe_val & DAOS_SELF_HEAL_DELAY_REBUILD ? -1 : delay_sec);
@@ -2662,7 +2661,7 @@ ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop, uint64_t sys
26622661
if (rc != 0)
26632662
return rc;
26642663
} else {
2665-
D_DEBUG(DB_REBUILD, DF_UUID" self healing is disabled\n",
2664+
D_DEBUG(DB_REBUILD, "Pool " DF_UUID " self healing is disabled\n",
26662665
DP_UUID(pool->sp_uuid));
26672666
}
26682667

@@ -2699,7 +2698,8 @@ ds_rebuild_admin_start(struct ds_pool *pool)
26992698
goto out;
27002699
}
27012700

2702-
rc = ds_rebuild_regenerate_task(pool, &prop, DS_MGMT_SELF_HEAL_ALL, 0);
2701+
rc = ds_rebuild_regenerate_task(pool, &prop, DS_MGMT_SELF_HEAL_ALL /* sys_self_heal */,
2702+
false /* auto_recovery */, 0 /* delay_sec */);
27032703
daos_prop_fini(&prop);
27042704
if (rc)
27052705
DL_ERROR(rc, DF_UUID ": regenerate rebuild task failed", DP_UUID(pool->sp_uuid));

src/tests/suite/daos_degrade_ec.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/**
22
* (C) Copyright 2016-2023 Intel Corporation.
3+
* (C) Copyright 2026 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -38,6 +39,11 @@ degrade_small_sub_setup(void **state)
3839

3940
arg = *state;
4041
arg->no_rebuild = 1;
42+
43+
/* Disable manual rebuilds */
44+
test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS);
45+
46+
/* Disable automatic rebuilds */
4147
rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal",
4248
"exclude");
4349
return rc;
@@ -56,6 +62,11 @@ degrade_sub_setup(void **state)
5662

5763
arg = *state;
5864
arg->no_rebuild = 1;
65+
66+
/* Disable manual rebuilds */
67+
test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS);
68+
69+
/* Disable automatic rebuilds */
5970
rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal",
6071
"exclude");
6172
return rc;
@@ -74,6 +85,11 @@ degrade_sub_rf1_setup(void **state)
7485

7586
arg = *state;
7687
arg->no_rebuild = 1;
88+
89+
/* Disable manual rebuilds */
90+
test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS);
91+
92+
/* Disable automatic rebuilds */
7793
rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal",
7894
"exclude");
7995
return rc;

src/tests/suite/daos_degraded.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/**
22
* (C) Copyright 2016-2023 Intel Corporation.
3+
* (C) Copyright 2026 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -258,6 +259,11 @@ degraded_setup(void **state)
258259

259260
arg = *state;
260261
arg->no_rebuild = 1;
262+
263+
/* Disable manual rebuilds */
264+
test_set_engine_fail_loc(arg, CRT_NO_RANK, DAOS_REBUILD_DISABLE | DAOS_FAIL_ALWAYS);
265+
266+
/* Disable automatic rebuilds */
261267
rc = daos_pool_set_prop(arg->pool.pool_uuid, "self_heal",
262268
"exclude");
263269
return rc;

0 commit comments

Comments
 (0)