diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index c1d9c9e85cb..4b249a157d6 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -2210,15 +2210,13 @@ crt_ivsync_rpc_issue(struct crt_ivns_internal *ivns_internal, uint32_t class_id, crt_iv_comp_cb_t update_comp_cb, void *cb_arg, void *user_priv, int update_rc) { - crt_rpc_t *corpc_req = NULL; - struct crt_iv_sync_in *input; - int rc = 0; - bool delay_completion = false; - struct iv_sync_cb_info *iv_sync_cb = NULL; - struct crt_iv_ops *iv_ops; - crt_bulk_t local_bulk = CRT_BULK_NULL; - d_rank_list_t excluded_list; - d_rank_t excluded_ranks[1]; /* Excluding self */ + struct crt_iv_sync_in *input; + struct crt_iv_ops *iv_ops; + crt_rpc_t *corpc_req = NULL; + struct iv_sync_cb_info *iv_sync_cb = NULL; + crt_bulk_t local_bulk = CRT_BULK_NULL; + int rc = 0; + bool delay_completion = false; iv_ops = crt_iv_ops_get(ivns_internal, class_id); D_ASSERT(iv_ops != NULL); @@ -2243,10 +2241,6 @@ crt_ivsync_rpc_issue(struct crt_ivns_internal *ivns_internal, uint32_t class_id, D_GOTO(exit, rc = -DER_INVAL); } - /* Exclude self from corpc */ - excluded_list.rl_nr = 1; - excluded_list.rl_ranks = excluded_ranks; - excluded_ranks[0] = ivns_internal->cii_grp_priv->gp_self; /* Perform refresh on local node */ if (sync_type->ivs_event == CRT_IV_SYNC_EVENT_UPDATE) rc = iv_ops->ivo_on_refresh(ivns_internal, iv_key, 0, @@ -2273,13 +2267,9 @@ crt_ivsync_rpc_issue(struct crt_ivns_internal *ivns_internal, uint32_t class_id, } } - rc = crt_corpc_req_create(ivns_internal->cii_ctx, - &ivns_internal->cii_grp_priv->gp_pub, - &excluded_list, - CRT_OPC_IV_SYNC, - local_bulk, NULL, 0, - ivns_internal->cii_gns.gn_tree_topo, - &corpc_req); + rc = crt_corpc_req_create(ivns_internal->cii_ctx, &ivns_internal->cii_grp_priv->gp_pub, + NULL, CRT_OPC_IV_SYNC, local_bulk, NULL, 0, + ivns_internal->cii_gns.gn_tree_topo, &corpc_req); if (rc != 0) { D_ERROR("crt_corpc_req_create(): "DF_RC"\n", DP_RC(rc)); D_GOTO(exit, rc); diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index 39821ec6328..598de3b25be 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -183,9 +184,7 @@ btr_ops_t chk_pool_ops = { struct chk_pending_bundle { d_list_t *cpb_pool_head; d_list_t *cpb_rank_head; - d_rank_t cpb_rank; - uuid_t cpb_uuid; - uint32_t cpb_class; + struct chk_report_unit *cpb_cru; uint64_t cpb_seq; }; @@ -210,11 +209,14 @@ chk_pending_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov, struct chk_pending_bundle *cpb = val_iov->iov_buf; struct chk_pending_rec *cpr = NULL; int rc = 0; + int i; + size_t size; D_ASSERT(cpb != NULL); D_ASSERT(val_out != NULL); - D_ALLOC_PTR(cpr); + size = sizeof(*cpr) + sizeof(uint32_t) * cpb->cpb_cru->cru_option_nr; + D_ALLOC(cpr, size); if (cpr == NULL) D_GOTO(out, rc = -DER_NOMEM); @@ -226,11 +228,14 @@ chk_pending_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov, if (rc != 0) D_GOTO(out, rc = dss_abterr2der(rc)); - uuid_copy(cpr->cpr_uuid, cpb->cpb_uuid); - cpr->cpr_seq = cpb->cpb_seq; - cpr->cpr_rank = cpb->cpb_rank; - cpr->cpr_class = cpb->cpb_class; - cpr->cpr_action = CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT; + uuid_copy(cpr->cpr_uuid, *(cpb->cpb_cru->cru_pool)); + cpr->cpr_seq = cpb->cpb_seq; + cpr->cpr_rank = cpb->cpb_cru->cru_rank; + cpr->cpr_class = cpb->cpb_cru->cru_cla; + cpr->cpr_action = CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT; + cpr->cpr_option_nr = cpb->cpb_cru->cru_option_nr; + for (i = 0; i < cpr->cpr_option_nr; i++) + cpr->cpr_options[i] = cpb->cpb_cru->cru_options[i]; if (cpb->cpb_rank_head != NULL) d_list_add_tail(&cpr->cpr_rank_link, cpb->cpb_rank_head); @@ -332,20 +337,15 @@ chk_ranks_dump(uint32_t rank_nr, d_rank_t *ranks) D_INFO("Ranks List:\n"); while (rank_nr >= 8) { - D_INFO("%8u %8u %8u %8u %8u %8u %8u %8u\n", - ranks[0], ranks[1], ranks[2], ranks[3], + D_INFO("%8u%8u%8u%8u%8u%8u%8u%8u\n", ranks[0], ranks[1], ranks[2], ranks[3], ranks[4], ranks[5], ranks[6], ranks[7]); rank_nr -= 8; ranks += 8; } if (rank_nr > 0) { - rc = snprintf(ptr, 79, "%8u", ranks[0]); - D_ASSERT(rc > 0); - ptr += rc; - - for (i = 1; i < rank_nr; i++) { - rc = snprintf(ptr, 79 - 8 * i, " %8u", ranks[i]); + for (i = 0; i < rank_nr; i++) { + rc = snprintf(ptr, 79 - 8 * i, "%8u", ranks[i]); D_ASSERT(rc > 0); ptr += rc; } @@ -889,8 +889,8 @@ chk_pool_shard_cleanup(struct chk_instance *ins) } int -chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid, - uint64_t seq, uint32_t rank, uint32_t cla, struct chk_pending_rec **cpr) +chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, + struct chk_report_unit *cru, uint64_t seq, struct chk_pending_rec **cpr) { struct chk_pending_bundle rbund; d_iov_t kiov; @@ -900,12 +900,10 @@ chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_he D_ASSERT(cpr != NULL); - uuid_copy(rbund.cpb_uuid, uuid); rbund.cpb_pool_head = pool_head; rbund.cpb_rank_head = rank_head; - rbund.cpb_seq = seq; - rbund.cpb_rank = rank; - rbund.cpb_class = cla; + rbund.cpb_seq = seq; + rbund.cpb_cru = cru; d_iov_set(&viov, NULL, 0); d_iov_set(&riov, &rbund, sizeof(rbund)); @@ -922,36 +920,55 @@ chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_he ABT_rwlock_unlock(ins->ci_abt_lock); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG, - "Add pending record with gen "DF_X64", seq "DF_X64", rank %u, class %u: "DF_RC"\n", - ins->ci_bk.cb_gen, seq, rank, cla, DP_RC(rc)); + "Add pending record, gen " DF_X64 ", seq " DF_X64 ", rank %u, cla %u: " DF_RC "\n", + ins->ci_bk.cb_gen, seq, cru->cru_rank, cru->cru_cla, DP_RC(rc)); return rc; } int -chk_pending_del(struct chk_instance *ins, uint64_t seq, bool locked, struct chk_pending_rec **cpr) +chk_pending_del(struct chk_instance *ins, uint64_t seq, uint32_t act, bool locked, + struct chk_pending_rec **cpr) { d_iov_t kiov; d_iov_t riov; int rc; + int i; + bool matched = false; d_iov_set(&riov, NULL, 0); d_iov_set(&kiov, &seq, sizeof(seq)); if (!locked) ABT_rwlock_wrlock(ins->ci_abt_lock); - rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, &riov); - if (!locked) - ABT_rwlock_unlock(ins->ci_abt_lock); + rc = dbtree_lookup(ins->ci_pending_hdl, &kiov, &riov); + if (rc != 0) + goto out; - if (rc == 0) - *cpr = (struct chk_pending_rec *)riov.iov_buf; + *cpr = (struct chk_pending_rec *)riov.iov_buf; + for (i = 0; i < (*cpr)->cpr_option_nr; i++) { + if ((*cpr)->cpr_options[i] == act) { + matched = true; + break; + } + } + + if (matched) + rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ | BTR_PROBE_BYPASS, &kiov, + &riov); else - *cpr = NULL; + rc = -DER_MISMATCH; + +out: + if (!locked) + ABT_rwlock_unlock(ins->ci_abt_lock); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG, - "Del pending record with gen "DF_X64", seq "DF_X64": "DF_RC"\n", - ins->ci_bk.cb_gen, seq, DP_RC(rc)); + "Del pending record, gen " DF_X64 ", seq " DF_X64 ", act %u: " DF_RC "\n", + ins->ci_bk.cb_gen, seq, act, DP_RC(rc)); + + if (rc != 0) + *cpr = NULL; return rc; } @@ -1032,9 +1049,9 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, /* Reuse former policies if "policy_nr == 0". */ if (policy_nr > 0) { - memset(prop->cp_policies, 0, sizeof(Chk__CheckInconsistAction) * CHK_POLICY_MAX); + memset(prop->cp_policies, 0, sizeof(Chk__CheckInconsistAction) * CHK_CLASS_MAX); for (i = 0; i < policy_nr; i++) { - if (unlikely(policies[i].cp_class >= CHK_POLICY_MAX)) { + if (unlikely(policies[i].cp_class >= CHK_CLASS_MAX)) { D_ERROR("Invalid DAOS inconsistency class %u\n", policies[i].cp_class); D_GOTO(out, rc = -DER_INVAL); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index ad61af851ce..21843c2dd33 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2642,7 +2643,7 @@ chk_engine_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo struct chk_pending_rec *cpr = NULL; int rc; - rc = chk_pending_del(ins, seq, locked, &cpr); + rc = chk_pending_del(ins, seq, act, locked, &cpr); if (rc == 0) { /* The cpr will be destroyed by the waiter via chk_engine_report(). */ D_ASSERT(cpr->cpr_busy); @@ -2674,12 +2675,13 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, uint32_t struct chk_pool_rec *pool_tmp = NULL; struct chk_pending_rec *cpr = NULL; struct chk_pending_rec *cpr_tmp = NULL; - int rc; + int rc = 0; + int rc1; if (ins->ci_bk.cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (unlikely(cla >= CHK_POLICY_MAX)) { + if (unlikely(cla >= CHK_CLASS_MAX)) { D_ERROR("Invalid DAOS inconsistency class %u\n", cla); D_GOTO(out, rc = -DER_INVAL); } @@ -2690,12 +2692,12 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, uint32_t D_GOTO(out, rc = -DER_INVAL); } - rc = chk_engine_act_internal(ins, seq, act, false); - if (rc == -DER_NONEXIST || rc == -DER_NO_HDL) - rc = 0; - - if (rc != 0 || !(flags & CAF_FOR_ALL)) + if (!(flags & CAF_FOR_ALL)) { + rc = chk_engine_act_internal(ins, seq, act, false); + if (rc == -DER_NONEXIST || rc == -DER_NO_HDL) + rc = 0; goto out; + } if (likely(prop->cp_policies[cla] != act)) { prop->cp_policies[cla] = act; @@ -2709,23 +2711,20 @@ chk_engine_act(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, uint32_t d_list_for_each_entry(pool, &ins->ci_pool_list, cpr_link) chk_pool_get(pool); + ABT_rwlock_wrlock(ins->ci_abt_lock); d_list_for_each_entry_safe(pool, pool_tmp, &ins->ci_pool_list, cpr_link) { - if (rc == 0) { - ABT_rwlock_wrlock(ins->ci_abt_lock); - d_list_for_each_entry_safe(cpr, cpr_tmp, &pool->cpr_pending_list, - cpr_pool_link) { - if (cpr->cpr_class != cla || - cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) - continue; - - rc = chk_engine_act_internal(ins, cpr->cpr_seq, act, true); - if (rc != 0) - break; - } - ABT_rwlock_unlock(ins->ci_abt_lock); + d_list_for_each_entry_safe(cpr, cpr_tmp, &pool->cpr_pending_list, cpr_pool_link) { + if (cpr->cpr_class != cla || + cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) + continue; + + rc1 = chk_engine_act_internal(ins, cpr->cpr_seq, act, true); + if (rc1 != 0 && rc == 0) + rc = rc1; } chk_pool_put(pool); } + ABT_rwlock_unlock(ins->ci_abt_lock); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, @@ -3148,8 +3147,7 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) pool = (struct chk_pool_rec *)riov.iov_buf; - rc = chk_pending_add(ins, &pool->cpr_pending_list, NULL, *cru->cru_pool, *seq, - cru->cru_rank, cru->cru_cla, &cpr); + rc = chk_pending_add(ins, &pool->cpr_pending_list, NULL, cru, *seq, &cpr); if (unlikely(rc == -DER_AGAIN)) goto new_seq; @@ -3166,9 +3164,9 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) if (unlikely(rc == -DER_AGAIN)) { D_ASSERT(cru->cru_act == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT); - rc = chk_pending_del(ins, *seq, false, &tmp); + rc = chk_pending_del(ins, *seq, cru->cru_options[0], false, &tmp); if (rc == 0) - D_ASSERT(tmp == NULL); + D_ASSERT(tmp == cpr); else if (rc != -DER_NONEXIST) goto log; diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 9ab16b060b3..0eac1d5e896 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -312,7 +313,7 @@ CRT_RPC_DECLARE(chk_rejoin, DAOS_ISEQ_CHK_REJOIN, DAOS_OSEQ_CHK_REJOIN); * NOTE: Please be careful when change CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN * to avoid hole is the struct chk_property. */ -#define CHK_POLICY_MAX (CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN + 1) +#define CHK_CLASS_MAX (CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN + 1) struct chk_co_rpc_cb_args { void *cb_priv; @@ -486,7 +487,7 @@ struct chk_bookmark { struct chk_property { d_rank_t cp_leader; Chk__CheckFlag cp_flags; - Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; + Chk__CheckInconsistAction cp_policies[CHK_CLASS_MAX]; /* * NOTE: Preserve for supporting to continue the check until the specified phase in the * future. -1 means to check all phases. @@ -621,23 +622,6 @@ struct chk_pool_rec { ABT_cond cpr_cond; }; -struct chk_pending_rec { - /* Link into chk_pool_rec::cpr_pending_list. */ - d_list_t cpr_pool_link; - /* Link into chk_rank_rec::crr_pending_list. */ - d_list_t cpr_rank_link; - uuid_t cpr_uuid; - uint64_t cpr_seq; - d_rank_t cpr_rank; - uint32_t cpr_class; - uint32_t cpr_action; - uint32_t cpr_busy:1, - cpr_exiting:1, - cpr_on_leader:1; - ABT_mutex cpr_mutex; - ABT_cond cpr_cond; -}; - struct chk_report_unit { uint64_t cru_gen; uint32_t cru_cla; @@ -660,6 +644,23 @@ struct chk_report_unit { uint32_t cru_result; }; +struct chk_pending_rec { + /* Link into chk_pool_rec::cpr_pending_list. */ + d_list_t cpr_pool_link; + /* Link into chk_rank_rec::crr_pending_list. */ + d_list_t cpr_rank_link; + uuid_t cpr_uuid; + uint64_t cpr_seq; + uint32_t cpr_rank; + uint32_t cpr_class; + uint32_t cpr_action; + uint32_t cpr_busy : 1, cpr_exiting : 1, cpr_on_leader : 1; + ABT_mutex cpr_mutex; + ABT_cond cpr_cond; + uint32_t cpr_option_nr; + uint32_t cpr_options[0]; +}; + struct chk_traverse_pools_args { uint64_t ctpa_gen; struct chk_instance *ctpa_ins; @@ -725,11 +726,13 @@ int chk_pool_add_shard(daos_handle_t hdl, d_list_t *head, uuid_t uuid, d_rank_t void chk_pool_shard_cleanup(struct chk_instance *ins); -int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid, - uint64_t seq, uint32_t rank, uint32_t cla, struct chk_pending_rec **cpr); +/* clang-format off */ +int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, + struct chk_report_unit *cru, uint64_t seq, struct chk_pending_rec **cpr); -int chk_pending_del(struct chk_instance *ins, uint64_t seq, bool locked, +int chk_pending_del(struct chk_instance *ins, uint64_t seq, uint32_t act, bool locked, struct chk_pending_rec **cpr); +/* clang-format on */ int chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr); diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index 0e832b38f18..14f04f277a9 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -3389,12 +3390,11 @@ chk_leader_prop(chk_prop_cb_t prop_cb, void *buf) { struct chk_property *prop = &chk_leader->ci_prop; - return prop_cb(buf, prop->cp_policies, CHK_POLICY_MAX - 1, prop->cp_flags); + return prop_cb(buf, prop->cp_policies, CHK_CLASS_MAX - 1, prop->cp_flags); } static int -chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bool for_all, - bool locked, uint32_t *cla) +chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bool for_all) { struct chk_pending_rec *pending = NULL; struct chk_pool_rec *pool = NULL; @@ -3402,7 +3402,7 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo d_iov_t riov; int rc; - rc = chk_pending_del(ins, seq, locked, &pending); + rc = chk_pending_del(ins, seq, act, for_all, &pending); if (rc != 0) goto out; @@ -3417,9 +3417,6 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo pending->cpr_action = act; ABT_cond_broadcast(pending->cpr_cond); ABT_mutex_unlock(pending->cpr_mutex); - - if (cla != NULL) - *cla = pending->cpr_class; } else { d_iov_set(&riov, NULL, 0); d_iov_set(&kiov, pending->cpr_uuid, sizeof(uuid_t)); @@ -3432,8 +3429,8 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act, bo rc = 0; } - /* For locked case, check engines have already processed related interaction. */ - if (!locked) + /* "for_all" case, check engines have already processed related interaction. */ + if (!for_all) rc = chk_act_remote(ins->ci_ranks, ins->ci_bk.cb_gen, seq, pending->cpr_class, act, pending->cpr_rank, for_all); @@ -3458,8 +3455,9 @@ chk_leader_act(uint64_t seq, uint32_t act, bool for_all) struct chk_pool_rec *pool_tmp = NULL; struct chk_pending_rec *cpr = NULL; struct chk_pending_rec *cpr_tmp = NULL; - uint32_t cla = 0; - int rc; + uint32_t cla = seq; + int rc = 0; + int rc1; if (cbk->cb_magic != CHK_BK_MAGIC_LEADER) D_GOTO(out, rc = -DER_NOTLEADER); @@ -3474,15 +3472,25 @@ chk_leader_act(uint64_t seq, uint32_t act, bool for_all) D_GOTO(out, rc = -DER_INVAL); } - rc = chk_leader_act_internal(ins, seq, act, for_all, false, &cla); - if (rc != 0 || !for_all) + if (!for_all) { + rc = chk_leader_act_internal(ins, seq, act, for_all); goto out; + } + + if (cla >= CHK_CLASS_MAX) { + D_ERROR("Invalid DAOS inconsistency class %u\n", cla); + D_GOTO(out, rc = -DER_INVAL); + } if (likely(prop->cp_policies[cla] != act)) { prop->cp_policies[cla] = act; chk_prop_update(prop, NULL); } + rc = chk_act_remote(ins->ci_ranks, ins->ci_bk.cb_gen, seq, cla, act, -1, true); + if (rc != 0) + goto out; + /* * Hold reference on each to guarantee that the next 'tmp' will not be unlinked from the * pool list during current pool process. @@ -3490,24 +3498,20 @@ chk_leader_act(uint64_t seq, uint32_t act, bool for_all) d_list_for_each_entry(pool, &ins->ci_pool_list, cpr_link) chk_pool_get(pool); + ABT_rwlock_wrlock(ins->ci_abt_lock); d_list_for_each_entry_safe(pool, pool_tmp, &ins->ci_pool_list, cpr_link) { - if (rc == 0) { - ABT_rwlock_wrlock(ins->ci_abt_lock); - d_list_for_each_entry_safe(cpr, cpr_tmp, &pool->cpr_pending_list, - cpr_pool_link) { - if (cpr->cpr_class != cla || - cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) - continue; + d_list_for_each_entry_safe(cpr, cpr_tmp, &pool->cpr_pending_list, cpr_pool_link) { + if (cpr->cpr_class != cla || + cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) + continue; - rc = chk_leader_act_internal(ins, cpr->cpr_seq, act, false, true, - NULL); - if (rc != 0) - break; - } - ABT_rwlock_unlock(ins->ci_abt_lock); + rc1 = chk_leader_act_internal(ins, cpr->cpr_seq, act, for_all); + if (rc1 != 0 && rc == 0) + rc = rc1; } chk_pool_put(pool); } + ABT_rwlock_unlock(ins->ci_abt_lock); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, @@ -3575,8 +3579,7 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) } rc = chk_pending_add(ins, &pool->cpr_pending_list, - crr != NULL ? &crr->crr_pending_list : NULL, - *cru->cru_pool, *seq, cru->cru_rank, cru->cru_cla, &cpr); + crr != NULL ? &crr->crr_pending_list : NULL, cru, *seq, &cpr); if (decision != NULL) { if (unlikely(rc == -DER_AGAIN)) goto new_seq; diff --git a/src/control/cmd/dmg/check.go b/src/control/cmd/dmg/check.go index 3b0144f9079..ca920f32b4d 100644 --- a/src/control/cmd/dmg/check.go +++ b/src/control/cmd/dmg/check.go @@ -1,5 +1,6 @@ // // (C) Copyright 2022-2023 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -361,39 +362,46 @@ type checkRepairCmd struct { ForAll bool `short:"f" long:"for-all" description:"Take the same action for all inconsistencies with the same class."` Args struct { - SeqNum repairSeqNum `positional-arg-name:"[seq-num]" required:"1"` - SelectedAction int `positional-arg-name:"[action]" required:"1"` + SeqNum repairSeqNum `positional-arg-name:"seq-num|inconsistency-class" required:"1"` + SelectedAction int `positional-arg-name:"interact-opt|action" required:"1"` } `positional-args:"yes"` } func (cmd *checkRepairCmd) Execute(_ []string) error { ctx := context.Background() - qReq := new(control.SystemCheckQueryReq) - qReq.Seqs = []uint64{uint64(cmd.Args.SeqNum)} - qResp, err := control.SystemCheckQuery(ctx, cmd.ctlInvoker, qReq) - if err != nil { - return err - } - - if len(qResp.Reports) == 0 { - return errors.Errorf("no report found for seq %s", cmd.Args.SeqNum) - } - - report := qResp.Reports[0] - if !report.IsInteractive() { - return errors.Errorf("finding %s is already resolved: %s", cmd.Args.SeqNum, report.Resolution()) - } - choices := report.RepairChoices() - if cmd.Args.SelectedAction < 0 || cmd.Args.SelectedAction >= len(choices) { - return errors.Errorf("invalid action %d for seq %s", cmd.Args.SelectedAction, cmd.Args.SeqNum) - } - req := new(control.SystemCheckRepairReq) req.Seq = uint64(cmd.Args.SeqNum) req.ForAll = cmd.ForAll - if err := req.SetAction(int32(choices[cmd.Args.SelectedAction].Action)); err != nil { - return err + + if cmd.ForAll { + if err := req.SetAction(int32(cmd.Args.SelectedAction)); err != nil { + return err + } + } else { + qReq := new(control.SystemCheckQueryReq) + qReq.Seqs = []uint64{uint64(cmd.Args.SeqNum)} + qResp, err := control.SystemCheckQuery(ctx, cmd.ctlInvoker, qReq) + if err != nil { + return err + } + + if len(qResp.Reports) == 0 { + return errors.Errorf("no report found for seq %s", cmd.Args.SeqNum) + } + + report := qResp.Reports[0] + if !report.IsInteractive() { + return errors.Errorf("finding %s is already resolved: %s", cmd.Args.SeqNum, report.Resolution()) + } + choices := report.RepairChoices() + if cmd.Args.SelectedAction < 0 || cmd.Args.SelectedAction >= len(choices) { + return errors.Errorf("invalid action %d for seq %s", cmd.Args.SelectedAction, cmd.Args.SeqNum) + } + + if err := req.SetAction(int32(choices[cmd.Args.SelectedAction].Action)); err != nil { + return err + } } if err := control.SystemCheckRepair(ctx, cmd.ctlInvoker, req); err != nil { diff --git a/src/control/server/mgmt_check.go b/src/control/server/mgmt_check.go index 4ae2159f2d7..cb7c09dd4ee 100644 --- a/src/control/server/mgmt_check.go +++ b/src/control/server/mgmt_check.go @@ -1,6 +1,7 @@ // // (C) Copyright 2022-2024 Intel Corporation. // (C) Copyright 2025 Google LLC +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -528,13 +529,15 @@ func (svc *mgmtSvc) SystemCheckRepair(ctx context.Context, req *mgmtpb.CheckActR return nil, err } - f, err := svc.sysdb.GetCheckerFinding(req.Seq) - if err != nil { - return nil, err - } + if !req.ForAll { + f, err := svc.sysdb.GetCheckerFinding(req.Seq) + if err != nil { + return nil, err + } - if !f.HasChoice(req.Act) { - return nil, errors.Errorf("invalid action %s (must be one of %s)", req.Act, f.ValidChoicesString()) + if !f.HasChoice(req.Act) { + return nil, errors.Errorf("invalid action %s (must be one of %s)", req.Act, f.ValidChoicesString()) + } } dResp, err := svc.makeCheckerCall(ctx, drpc.MethodCheckerAction, req) @@ -547,7 +550,7 @@ func (svc *mgmtSvc) SystemCheckRepair(ctx context.Context, req *mgmtpb.CheckActR return nil, errors.Wrap(err, "unmarshal CheckRepair response") } - if resp.Status == 0 { + if !req.ForAll && resp.Status == 0 { if err := svc.sysdb.SetCheckerFindingAction(req.Seq, int32(req.Act)); err != nil { return nil, err } diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 12b4a9958dc..f448ea7b8cb 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -242,7 +243,7 @@ cr_system_stop(bool force) } static inline int -cr_rank_reint(uint32_t rank, bool start) +cr_rank_reint(uint32_t rank) { int rc; @@ -251,10 +252,8 @@ cr_rank_reint(uint32_t rank, bool start) if (rc != 0) return rc; - if (start) { - print_message("CR: starting the rank %u ...\n", rank); - rc = dmg_system_start_rank(dmg_config_file, rank); - } + print_message("CR: starting the rank %u ...\n", rank); + rc = dmg_system_start_rank(dmg_config_file, rank); return rc; } @@ -1338,8 +1337,7 @@ cr_repair_forall_leader(void **state) { test_arg_t *arg = *state; struct test_pool pools[2] = { 0 }; - struct daos_check_info dci = { 0 }; - struct daos_check_report_info *dcri; + struct daos_check_info dci = {0}; char *ps_label = NULL; char *ptr; char ms_label[DAOS_PROP_LABEL_MAX_LEN]; @@ -1375,16 +1373,8 @@ cr_repair_forall_leader(void **state) rc = cr_pool_verify(&dci, pools[0].pool_uuid, TCPS_PENDING, 1, &class, &action, NULL); assert_rc_equal(rc, 0); - dcri = cr_locate_dcri(&dci, NULL, pools[0].pool_uuid); action = TCA_TRUST_PS; - rc = -DER_MISC; - - for (i = 0; i < dcri->dcri_option_nr; i++) { - if (dcri->dcri_options[i] == action) { - rc = cr_check_repair(dcri->dcri_seq, i, true); - break; - } - } + rc = cr_check_repair(class, action, true); assert_rc_equal(rc, 0); for (i = 0; i < 2; i++) { @@ -1444,8 +1434,7 @@ cr_repair_forall_engine(void **state) test_arg_t *arg = *state; struct test_pool pools[2] = { 0 }; struct test_cont conts[2] = { 0 }; - struct daos_check_info dci = { 0 }; - struct daos_check_report_info *dcri; + struct daos_check_info dci = {0}; char *target_label = NULL; char *ptr; char ps_label[DAOS_PROP_LABEL_MAX_LEN]; @@ -1484,16 +1473,8 @@ cr_repair_forall_engine(void **state) rc = cr_pool_verify(&dci, pools[0].pool_uuid, TCPS_PENDING, 1, &class, &action, NULL); assert_rc_equal(rc, 0); - dcri = cr_locate_dcri(&dci, NULL, pools[0].pool_uuid); action = TCA_TRUST_TARGET; - rc = -DER_MISC; - - for (i = 0; i < dcri->dcri_option_nr; i++) { - if (dcri->dcri_options[i] == action) { - rc = cr_check_repair(dcri->dcri_seq, i, true); - break; - } - } + rc = cr_check_repair(class, action, true); assert_rc_equal(rc, 0); for (i = 0; i < 2; i++) { @@ -2858,7 +2839,7 @@ cr_engine_death(void **state) assert_rc_equal(rc, 0); /* Reint the rank for subsequent test. */ - rc = cr_rank_reint(rank, false); + rc = cr_rank_reint(rank); assert_rc_equal(rc, 0); rc = cr_mode_switch(false); @@ -2941,7 +2922,7 @@ cr_engine_rejoin_succ(void **state) assert_rc_equal(rc, 0); /* Reint the rank immediately before the rank death event being detected. */ - rc = cr_rank_reint(rank, true); + rc = cr_rank_reint(rank); assert_rc_equal(rc, 0); cr_pool_wait(1, &pool.pool_uuid, &dci); @@ -3084,7 +3065,7 @@ cr_engine_rejoin_fail(void **state) "Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result); /* Reint the rank, rejoin will fail but not affect the rank start. */ - rc = cr_rank_reint(rank, true); + rc = cr_rank_reint(rank); assert_rc_equal(rc, 0); /* Wait for a while until the control plane to be ready for new check start. */ @@ -3108,10 +3089,6 @@ cr_engine_rejoin_fail(void **state) } assert_rc_equal(rc, 0); - /* The former excluded rank is not in the check ranks set, stop it explicitly. */ - rc = dmg_system_stop_rank(dmg_config_file, rank, false); - assert_rc_equal(rc, 0); - rc = cr_mode_switch(false); assert_rc_equal(rc, 0); @@ -3409,8 +3386,7 @@ cr_inherit_policy(void **state) { test_arg_t *arg = *state; struct test_pool pools[2] = { 0 }; - struct daos_check_info dci = { 0 }; - struct daos_check_report_info *dcri; + struct daos_check_info dci = {0}; char *ps_label = NULL; char *ptr; char ms_label[DAOS_PROP_LABEL_MAX_LEN]; @@ -3449,16 +3425,8 @@ cr_inherit_policy(void **state) assert_rc_equal(rc, 0); } - dcri = cr_locate_dcri(&dci, NULL, pools[1].pool_uuid); action = TCA_TRUST_PS; - rc = -DER_MISC; - - for (i = 0; i < dcri->dcri_option_nr; i++) { - if (dcri->dcri_options[i] == action) { - rc = cr_check_repair(dcri->dcri_seq, i, true); - break; - } - } + rc = cr_check_repair(class, action, true); assert_rc_equal(rc, 0); for (i = 0; i < 2; i++) { @@ -3785,6 +3753,64 @@ cr_maintenance_mode(void **state) cr_cleanup(arg, &pool, 1); } +/* + * 1. Exclude rank 0. + * 2. Create pool without inconsistency. + * 3. Start checker without options. + * 4. Query checker, it should be completed instead of being blocked. + * 5. Switch to normal mode and cleanup. + */ +static void +cr_lost_rank0(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct daos_check_info dci = {0}; + int rc; + + FAULT_INJECTION_REQUIRED(); + + print_message("CR29: run CR with rank 0 excluded at the beginning\n"); + + print_message("CR: excluding the rank 0 ...\n"); + rc = dmg_system_exclude_rank(dmg_config_file, 0); + assert_rc_equal(rc, 0); + + rc = cr_pool_create(state, &pool, false, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_RESET, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + /* Reint the rank for subsequent test. */ + rc = cr_rank_reint(0); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + +/* clang-format off */ static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", cr_start_specified, async_disable, test_case_teardown}, @@ -3842,7 +3868,10 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool2, async_disable, test_case_teardown}, { "CR28: maintenance mode after dry-run check", cr_maintenance_mode, async_disable, test_case_teardown}, + { "CR29: run CR with rank 0 excluded at the beginning", + cr_lost_rank0, async_disable, test_case_teardown}, }; +/* clang-format on */ static int cr_setup(void **state)