Skip to content

Commit cfdd691

Browse files
committed
DAOS-17422 chk: enhance dmg check repair API
When the -f option is specified for the "dmg check repair" command, we allow the user/admin to handle subsequent inconsistencies of the same type with the same action. In that scenario, using the inconsistency-class is clearer than using the seq-num. The SYNOPSIS will then be as follows: dmg [OPTIONS] check repair [repair-OPTIONS] <seq-num|inconsistency-class> <interact-opt|action> ... [repair command options] -f, --for-all Take the same action for all (potential) inconsistencies with the same class. If this option is specified, then "inconsistency-class" and "action" will be accepted for subsequent parameters; otherwise, "seq-num" and "interact-opt" will be used. Enhance the test logic to make it workable after landing c9745d8. Test-tag: test_daos_cat_recov_core Signed-off-by: Fan Yong <[email protected]>
1 parent e9b6f9b commit cfdd691

File tree

7 files changed

+187
-187
lines changed

7 files changed

+187
-187
lines changed

src/chk/chk_common.c

Lines changed: 53 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/**
22
* (C) Copyright 2022-2024 Intel Corporation.
3+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -183,9 +184,7 @@ btr_ops_t chk_pool_ops = {
183184
struct chk_pending_bundle {
184185
d_list_t *cpb_pool_head;
185186
d_list_t *cpb_rank_head;
186-
d_rank_t cpb_rank;
187-
uuid_t cpb_uuid;
188-
uint32_t cpb_class;
187+
struct chk_report_unit *cpb_cru;
189188
uint64_t cpb_seq;
190189
};
191190

@@ -210,11 +209,14 @@ chk_pending_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov,
210209
struct chk_pending_bundle *cpb = val_iov->iov_buf;
211210
struct chk_pending_rec *cpr = NULL;
212211
int rc = 0;
212+
int i;
213+
size_t size;
213214

214215
D_ASSERT(cpb != NULL);
215216
D_ASSERT(val_out != NULL);
216217

217-
D_ALLOC_PTR(cpr);
218+
size = sizeof(*cpr) + sizeof(uint32_t) * cpb->cpb_cru->cru_option_nr;
219+
D_ALLOC(cpr, size);
218220
if (cpr == NULL)
219221
D_GOTO(out, rc = -DER_NOMEM);
220222

@@ -226,11 +228,14 @@ chk_pending_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov,
226228
if (rc != 0)
227229
D_GOTO(out, rc = dss_abterr2der(rc));
228230

229-
uuid_copy(cpr->cpr_uuid, cpb->cpb_uuid);
230-
cpr->cpr_seq = cpb->cpb_seq;
231-
cpr->cpr_rank = cpb->cpb_rank;
232-
cpr->cpr_class = cpb->cpb_class;
233-
cpr->cpr_action = CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT;
231+
uuid_copy(cpr->cpr_uuid, *(cpb->cpb_cru->cru_pool));
232+
cpr->cpr_seq = cpb->cpb_seq;
233+
cpr->cpr_rank = cpb->cpb_cru->cru_rank;
234+
cpr->cpr_class = cpb->cpb_cru->cru_cla;
235+
cpr->cpr_action = CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT;
236+
cpr->cpr_option_nr = cpb->cpb_cru->cru_option_nr;
237+
for (i = 0; i < cpr->cpr_option_nr; i++)
238+
cpr->cpr_options[i] = cpb->cpb_cru->cru_options[i];
234239

235240
if (cpb->cpb_rank_head != NULL)
236241
d_list_add_tail(&cpr->cpr_rank_link, cpb->cpb_rank_head);
@@ -332,20 +337,15 @@ chk_ranks_dump(uint32_t rank_nr, d_rank_t *ranks)
332337
D_INFO("Ranks List:\n");
333338

334339
while (rank_nr >= 8) {
335-
D_INFO("%8u %8u %8u %8u %8u %8u %8u %8u\n",
336-
ranks[0], ranks[1], ranks[2], ranks[3],
340+
D_INFO("%8u%8u%8u%8u%8u%8u%8u%8u\n", ranks[0], ranks[1], ranks[2], ranks[3],
337341
ranks[4], ranks[5], ranks[6], ranks[7]);
338342
rank_nr -= 8;
339343
ranks += 8;
340344
}
341345

342346
if (rank_nr > 0) {
343-
rc = snprintf(ptr, 79, "%8u", ranks[0]);
344-
D_ASSERT(rc > 0);
345-
ptr += rc;
346-
347-
for (i = 1; i < rank_nr; i++) {
348-
rc = snprintf(ptr, 79 - 8 * i, " %8u", ranks[i]);
347+
for (i = 0; i < rank_nr; i++) {
348+
rc = snprintf(ptr, 79 - 8 * i, "%8u", ranks[i]);
349349
D_ASSERT(rc > 0);
350350
ptr += rc;
351351
}
@@ -889,8 +889,8 @@ chk_pool_shard_cleanup(struct chk_instance *ins)
889889
}
890890

891891
int
892-
chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid,
893-
uint64_t seq, uint32_t rank, uint32_t cla, struct chk_pending_rec **cpr)
892+
chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head,
893+
struct chk_report_unit *cru, uint64_t seq, struct chk_pending_rec **cpr)
894894
{
895895
struct chk_pending_bundle rbund;
896896
d_iov_t kiov;
@@ -900,12 +900,10 @@ chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_he
900900

901901
D_ASSERT(cpr != NULL);
902902

903-
uuid_copy(rbund.cpb_uuid, uuid);
904903
rbund.cpb_pool_head = pool_head;
905904
rbund.cpb_rank_head = rank_head;
906-
rbund.cpb_seq = seq;
907-
rbund.cpb_rank = rank;
908-
rbund.cpb_class = cla;
905+
rbund.cpb_seq = seq;
906+
rbund.cpb_cru = cru;
909907

910908
d_iov_set(&viov, NULL, 0);
911909
d_iov_set(&riov, &rbund, sizeof(rbund));
@@ -922,36 +920,55 @@ chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_he
922920
ABT_rwlock_unlock(ins->ci_abt_lock);
923921

924922
D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG,
925-
"Add pending record with gen "DF_X64", seq "DF_X64", rank %u, class %u: "DF_RC"\n",
926-
ins->ci_bk.cb_gen, seq, rank, cla, DP_RC(rc));
923+
"Add pending record, gen " DF_X64 ", seq " DF_X64 ", rank %u, cla %u: " DF_RC "\n",
924+
ins->ci_bk.cb_gen, seq, cru->cru_rank, cru->cru_cla, DP_RC(rc));
927925

928926
return rc;
929927
}
930928

931929
int
932-
chk_pending_del(struct chk_instance *ins, uint64_t seq, bool locked, struct chk_pending_rec **cpr)
930+
chk_pending_del(struct chk_instance *ins, uint64_t seq, uint32_t act, bool locked,
931+
struct chk_pending_rec **cpr)
933932
{
934933
d_iov_t kiov;
935934
d_iov_t riov;
936935
int rc;
936+
int i;
937+
bool matched = false;
937938

938939
d_iov_set(&riov, NULL, 0);
939940
d_iov_set(&kiov, &seq, sizeof(seq));
940941

941942
if (!locked)
942943
ABT_rwlock_wrlock(ins->ci_abt_lock);
943-
rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, &riov);
944-
if (!locked)
945-
ABT_rwlock_unlock(ins->ci_abt_lock);
944+
rc = dbtree_lookup(ins->ci_pending_hdl, &kiov, &riov);
945+
if (rc != 0)
946+
goto out;
946947

947-
if (rc == 0)
948-
*cpr = (struct chk_pending_rec *)riov.iov_buf;
948+
*cpr = (struct chk_pending_rec *)riov.iov_buf;
949+
for (i = 0; i < (*cpr)->cpr_option_nr; i++) {
950+
if ((*cpr)->cpr_options[i] == act) {
951+
matched = true;
952+
break;
953+
}
954+
}
955+
956+
if (matched)
957+
rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ | BTR_PROBE_BYPASS, &kiov,
958+
&riov);
949959
else
950-
*cpr = NULL;
960+
rc = -DER_MISMATCH;
961+
962+
out:
963+
if (!locked)
964+
ABT_rwlock_unlock(ins->ci_abt_lock);
951965

952966
D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG,
953-
"Del pending record with gen "DF_X64", seq "DF_X64": "DF_RC"\n",
954-
ins->ci_bk.cb_gen, seq, DP_RC(rc));
967+
"Del pending record, gen " DF_X64 ", seq " DF_X64 ", act %u: " DF_RC "\n",
968+
ins->ci_bk.cb_gen, seq, act, DP_RC(rc));
969+
970+
if (rc != 0)
971+
*cpr = NULL;
955972

956973
return rc;
957974
}
@@ -1032,9 +1049,9 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase,
10321049

10331050
/* Reuse former policies if "policy_nr == 0". */
10341051
if (policy_nr > 0) {
1035-
memset(prop->cp_policies, 0, sizeof(Chk__CheckInconsistAction) * CHK_POLICY_MAX);
1052+
memset(prop->cp_policies, 0, sizeof(Chk__CheckInconsistAction) * CHK_CLASS_MAX);
10361053
for (i = 0; i < policy_nr; i++) {
1037-
if (unlikely(policies[i].cp_class >= CHK_POLICY_MAX)) {
1054+
if (unlikely(policies[i].cp_class >= CHK_CLASS_MAX)) {
10381055
D_ERROR("Invalid DAOS inconsistency class %u\n",
10391056
policies[i].cp_class);
10401057
D_GOTO(out, rc = -DER_INVAL);

src/chk/chk_engine.c

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/**
22
* (C) Copyright 2022-2024 Intel Corporation.
3+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -2642,7 +2643,7 @@ chk_engine_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo
26422643
struct chk_pending_rec *cpr = NULL;
26432644
int rc;
26442645

2645-
rc = chk_pending_del(ins, seq, locked, &cpr);
2646+
rc = chk_pending_del(ins, seq, act, locked, &cpr);
26462647
if (rc == 0) {
26472648
/* The cpr will be destroyed by the waiter via chk_engine_report(). */
26482649
D_ASSERT(cpr->cpr_busy);
@@ -2674,12 +2675,13 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, uint32_t
26742675
struct chk_pool_rec *pool_tmp = NULL;
26752676
struct chk_pending_rec *cpr = NULL;
26762677
struct chk_pending_rec *cpr_tmp = NULL;
2677-
int rc;
2678+
int rc = 0;
2679+
int rc1;
26782680

26792681
if (ins->ci_bk.cb_gen != gen)
26802682
D_GOTO(out, rc = -DER_NOTAPPLICABLE);
26812683

2682-
if (unlikely(cla >= CHK_POLICY_MAX)) {
2684+
if (unlikely(cla >= CHK_CLASS_MAX)) {
26832685
D_ERROR("Invalid DAOS inconsistency class %u\n", cla);
26842686
D_GOTO(out, rc = -DER_INVAL);
26852687
}
@@ -2690,12 +2692,12 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, uint32_t
26902692
D_GOTO(out, rc = -DER_INVAL);
26912693
}
26922694

2693-
rc = chk_engine_act_internal(ins, seq, act, false);
2694-
if (rc == -DER_NONEXIST || rc == -DER_NO_HDL)
2695-
rc = 0;
2696-
2697-
if (rc != 0 || !(flags & CAF_FOR_ALL))
2695+
if (!(flags & CAF_FOR_ALL)) {
2696+
rc = chk_engine_act_internal(ins, seq, act, false);
2697+
if (rc == -DER_NONEXIST || rc == -DER_NO_HDL)
2698+
rc = 0;
26982699
goto out;
2700+
}
26992701

27002702
if (likely(prop->cp_policies[cla] != act)) {
27012703
prop->cp_policies[cla] = act;
@@ -2709,23 +2711,20 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, uint32_t
27092711
d_list_for_each_entry(pool, &ins->ci_pool_list, cpr_link)
27102712
chk_pool_get(pool);
27112713

2714+
ABT_rwlock_wrlock(ins->ci_abt_lock);
27122715
d_list_for_each_entry_safe(pool, pool_tmp, &ins->ci_pool_list, cpr_link) {
2713-
if (rc == 0) {
2714-
ABT_rwlock_wrlock(ins->ci_abt_lock);
2715-
d_list_for_each_entry_safe(cpr, cpr_tmp, &pool->cpr_pending_list,
2716-
cpr_pool_link) {
2717-
if (cpr->cpr_class != cla ||
2718-
cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT)
2719-
continue;
2720-
2721-
rc = chk_engine_act_internal(ins, cpr->cpr_seq, act, true);
2722-
if (rc != 0)
2723-
break;
2724-
}
2725-
ABT_rwlock_unlock(ins->ci_abt_lock);
2716+
d_list_for_each_entry_safe(cpr, cpr_tmp, &pool->cpr_pending_list, cpr_pool_link) {
2717+
if (cpr->cpr_class != cla ||
2718+
cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT)
2719+
continue;
2720+
2721+
rc1 = chk_engine_act_internal(ins, cpr->cpr_seq, act, true);
2722+
if (rc1 != 0 && rc == 0)
2723+
rc = rc1;
27262724
}
27272725
chk_pool_put(pool);
27282726
}
2727+
ABT_rwlock_unlock(ins->ci_abt_lock);
27292728

27302729
out:
27312730
D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO,
@@ -3148,8 +3147,7 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
31483147

31493148
pool = (struct chk_pool_rec *)riov.iov_buf;
31503149

3151-
rc = chk_pending_add(ins, &pool->cpr_pending_list, NULL, *cru->cru_pool, *seq,
3152-
cru->cru_rank, cru->cru_cla, &cpr);
3150+
rc = chk_pending_add(ins, &pool->cpr_pending_list, NULL, cru, *seq, &cpr);
31533151
if (unlikely(rc == -DER_AGAIN))
31543152
goto new_seq;
31553153

@@ -3166,9 +3164,9 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
31663164
if (unlikely(rc == -DER_AGAIN)) {
31673165
D_ASSERT(cru->cru_act == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT);
31683166

3169-
rc = chk_pending_del(ins, *seq, false, &tmp);
3167+
rc = chk_pending_del(ins, *seq, cru->cru_options[0], false, &tmp);
31703168
if (rc == 0)
3171-
D_ASSERT(tmp == NULL);
3169+
D_ASSERT(tmp == cpr);
31723170
else if (rc != -DER_NONEXIST)
31733171
goto log;
31743172

src/chk/chk_internal.h

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/**
22
* (C) Copyright 2022-2024 Intel Corporation.
3+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -312,7 +313,7 @@ CRT_RPC_DECLARE(chk_rejoin, DAOS_ISEQ_CHK_REJOIN, DAOS_OSEQ_CHK_REJOIN);
312313
* NOTE: Please be careful when changing CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN
313314
* to avoid a hole in the struct chk_property.
314315
*/
315-
#define CHK_POLICY_MAX (CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN + 1)
316+
#define CHK_CLASS_MAX (CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN + 1)
316317

317318
struct chk_co_rpc_cb_args {
318319
void *cb_priv;
@@ -486,7 +487,7 @@ struct chk_bookmark {
486487
struct chk_property {
487488
d_rank_t cp_leader;
488489
Chk__CheckFlag cp_flags;
489-
Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX];
490+
Chk__CheckInconsistAction cp_policies[CHK_CLASS_MAX];
490491
/*
491492
* NOTE: Preserve for supporting to continue the check until the specified phase in the
492493
* future. -1 means to check all phases.
@@ -621,23 +622,6 @@ struct chk_pool_rec {
621622
ABT_cond cpr_cond;
622623
};
623624

624-
struct chk_pending_rec {
625-
/* Link into chk_pool_rec::cpr_pending_list. */
626-
d_list_t cpr_pool_link;
627-
/* Link into chk_rank_rec::crr_pending_list. */
628-
d_list_t cpr_rank_link;
629-
uuid_t cpr_uuid;
630-
uint64_t cpr_seq;
631-
d_rank_t cpr_rank;
632-
uint32_t cpr_class;
633-
uint32_t cpr_action;
634-
uint32_t cpr_busy:1,
635-
cpr_exiting:1,
636-
cpr_on_leader:1;
637-
ABT_mutex cpr_mutex;
638-
ABT_cond cpr_cond;
639-
};
640-
641625
struct chk_report_unit {
642626
uint64_t cru_gen;
643627
uint32_t cru_cla;
@@ -660,6 +644,23 @@ struct chk_report_unit {
660644
uint32_t cru_result;
661645
};
662646

647+
struct chk_pending_rec {
648+
/* Link into chk_pool_rec::cpr_pending_list. */
649+
d_list_t cpr_pool_link;
650+
/* Link into chk_rank_rec::crr_pending_list. */
651+
d_list_t cpr_rank_link;
652+
uuid_t cpr_uuid;
653+
uint64_t cpr_seq;
654+
uint32_t cpr_rank;
655+
uint32_t cpr_class;
656+
uint32_t cpr_action;
657+
uint32_t cpr_busy : 1, cpr_exiting : 1, cpr_on_leader : 1;
658+
ABT_mutex cpr_mutex;
659+
ABT_cond cpr_cond;
660+
uint32_t cpr_option_nr;
661+
uint32_t cpr_options[0];
662+
};
663+
663664
struct chk_traverse_pools_args {
664665
uint64_t ctpa_gen;
665666
struct chk_instance *ctpa_ins;
@@ -725,11 +726,13 @@ int chk_pool_add_shard(daos_handle_t hdl, d_list_t *head, uuid_t uuid, d_rank_t
725726

726727
void chk_pool_shard_cleanup(struct chk_instance *ins);
727728

728-
int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid,
729-
uint64_t seq, uint32_t rank, uint32_t cla, struct chk_pending_rec **cpr);
729+
/* clang-format off */
730+
int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head,
731+
struct chk_report_unit *cru, uint64_t seq, struct chk_pending_rec **cpr);
730732

731-
int chk_pending_del(struct chk_instance *ins, uint64_t seq, bool locked,
733+
int chk_pending_del(struct chk_instance *ins, uint64_t seq, uint32_t act, bool locked,
732734
struct chk_pending_rec **cpr);
735+
/* clang-format on */
733736

734737
int chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr);
735738

0 commit comments

Comments
 (0)