diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index 460c37a50ff..cc899ea92a7 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -1073,8 +1073,7 @@ chk_policy_refresh(uint32_t policy_nr, struct chk_policy *policies, struct chk_p } int -chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, +chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop) { int rc = 0; @@ -1086,11 +1085,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT; if (flags & CHK__CHECK_FLAG__CF_NO_AUTO) prop->cp_flags &= ~CHK__CHECK_FLAG__CF_AUTO; - prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | - CHK__CHECK_FLAG__CF_ORPHAN_POOL | - CHK__CHECK_FLAG__CF_NO_FAILOUT | - CHK__CHECK_FLAG__CF_NO_AUTO); - prop->cp_phase = phase; + prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | CHK__CHECK_FLAG__CF_ORPHAN_POOL | + CHK__CHECK_FLAG__CF_NO_FAILOUT | CHK__CHECK_FLAG__CF_NO_AUTO); if (ranks != NULL) prop->cp_rank_nr = ranks->rl_nr; @@ -1240,12 +1236,7 @@ chk_ins_cleanup(struct chk_instance *ins) chk_stop_sched(ins); ins->ci_inited = 0; - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); } int diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index c301d55a184..8e93dd4c4e3 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -2045,9 +2045,8 @@ chk_engine_sched(void *args) static int chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint64_t gen, int phase, uint32_t api_flags, - d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint64_t gen, uint32_t 
api_flags, d_rank_t leader, uint32_t flags) { struct chk_traverse_pools_args ctpa = { 0 }; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2134,8 +2133,7 @@ chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank init: if (!chk_is_on_leader(gen, leader, true)) { - rc = chk_prop_prepare(leader, api_flags, phase, policy_nr, policies, rank_list, - prop); + rc = chk_prop_prepare(leader, api_flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2263,16 +2261,15 @@ chk_engine_pool_filter(uuid_t uuid, void *arg, int *phase) int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], uint32_t api_flags, - int phase, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, + uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues) { - struct chk_instance *ins = chk_engine; - struct chk_bookmark *cbk = &ins->ci_bk; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - int rc; - int rc1; + struct chk_instance *ins = chk_engine; + struct chk_bookmark *cbk = &ins->ci_bk; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + int rc; + int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2294,12 +2291,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; @@ -2313,27 +2305,20 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (rc != 0) goto out_tree; - rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, - pool_nr, pools, gen, phase, api_flags, leader, flags); + rc = chk_engine_start_prep(ins, 
rank_nr, ranks, policy_nr, policies, pool_nr, pools, gen, + api_flags, leader, flags); if (rc != 0) goto out_tree; if (chk_is_on_leader(gen, leader, true)) { ins->ci_iv_ns = chk_leader_get_iv_ns(); - if (unlikely(ins->ci_iv_ns == NULL)) - goto out_tree; + D_ASSERT(ins->ci_iv_ns != NULL); + + ins->ci_ns_ver = ns_ver; } else { - uuid_unparse_lower(iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, iv_uuid, leader, ns_ver); if (rc != 0) goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); } uuid_copy(cbk->cb_iv_uuid, iv_uuid); @@ -2367,12 +2352,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic D_WARN(DF_ENGINE" failed to update engine bookmark: "DF_RC"\n", DP_ENGINE(ins), DP_RC(rc1)); } - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); @@ -2380,17 +2360,18 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic ins->ci_starting = 0; out_log: if (rc >= 0) { - D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, phase %d, leader %u, " - "flags %x, iv "DF_UUIDF": rc %d\n", + D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, ns_ver %d, leader %u, " + "flags %x, iv " DF_UUIDF ": rc %d\n", DP_ENGINE(ins), chk_is_ins_reset(ins, api_flags) ? 
"start" : "resume", - myrank, api_flags, phase, leader, flags, DP_UUID(iv_uuid), rc); + myrank, api_flags, ns_ver, leader, flags, DP_UUID(iv_uuid), rc); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, pool_nr, pools); } else { - D_ERROR(DF_ENGINE" failed to start on rank %u with %d pools, api_flags %x, " - "phase %d, leader %u, flags %x, gen "DF_X64", iv "DF_UUIDF": "DF_RC"\n", - DP_ENGINE(ins), myrank, pool_nr, api_flags, phase, leader, flags, gen, + D_ERROR(DF_ENGINE " failed to start on rank %u with %d pools, api_flags %x, " + "ns_ver %d, leader %u, flags %x, gen " DF_X64 ", iv " DF_UUIDF + ": " DF_RC "\n", + DP_ENGINE(ins), myrank, pool_nr, api_flags, ns_ver, leader, flags, gen, DP_UUID(iv_uuid), DP_RC(rc)); } @@ -2416,7 +2397,7 @@ chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags) if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) D_GOTO(log, rc = -DER_NOTAPPLICABLE); - if (ins->ci_starting) + if (ins->ci_starting || ins->ci_rejoining) D_GOTO(log, rc = -DER_BUSY); if (ins->ci_stopping || ins->ci_sched_exiting) @@ -2647,34 +2628,44 @@ chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *ins_status int chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - d_rank_list_t *rank_list = NULL; - int rc = 0; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + int rc = 0; CHK_IS_READY(ins); if (cbk->cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - rc = chk_prop_fetch(prop, &rank_list); - if (rc != 0) - goto out; + /* For check engine on the leader, reload rank list that has been refreshed by leader. 
*/ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = NULL; + } - D_ASSERT(rank_list != NULL); + if (ins->ci_ranks == NULL) { + rc = chk_prop_fetch(prop, &ins->ci_ranks); + if (rc != 0) + goto out; - /* For check engine on the leader, related rank has already been marked as "dead". */ - if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) - goto group; + /* For check engine on the leader, it's done. */ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + ins->ci_ns_ver = version; + goto out; + } + } + + if (unlikely(ins->ci_ranks == NULL)) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!chk_remove_rank_from_list(rank_list, rank)) + if (!chk_remove_rank_from_list(ins->ci_ranks, rank)) D_GOTO(out, rc = -DER_NOTAPPLICABLE); prop->cp_rank_nr--; - rc = chk_prop_update(prop, rank_list); - if (rc != 0) - goto out; + rc = chk_prop_update(prop, ins->ci_ranks); + if (rc == 0) + rc = chk_iv_ns_update(ins, version); /* * NOTE: If the rank dead before DAOS check start, then subsequent check start will @@ -2695,19 +2686,7 @@ chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) * sometime later as the DAOS check going. 
*/ -group: - if (ins->ci_iv_group != NULL) - rc = crt_group_secondary_modify(ins->ci_iv_group, rank_list, rank_list, - CRT_GROUP_MOD_OP_REPLACE, version); - out: - if (rc == 0) { - d_rank_list_free(ins->ci_ranks); - ins->ci_ranks = rank_list; - rank_list = NULL; - } - - d_rank_list_free(rank_list); if (rc != -DER_NOTAPPLICABLE) D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u mark rank %u as dead with gen " @@ -3383,19 +3362,19 @@ chk_engine_notify(struct chk_iv *iv) void chk_engine_rejoin(void *args) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *pools = NULL; - struct chk_iv iv = { 0 }; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - uint32_t pool_nr = 0; - uint32_t flags = 0; - int rc = 0; - int rc1; - bool need_join = false; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + d_rank_list_t *ranks = NULL; + uuid_t *pools = NULL; + struct chk_iv iv = {0}; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + uint32_t pool_nr = 0; + uint32_t flags = 0; + int rc = 0; + int rc1; + bool need_join = false; if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) goto out_log; @@ -3404,7 +3383,7 @@ chk_engine_rejoin(void *args) cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_PAUSED) goto out_log; - /* We do NOT support leader (and its associated engine ) to rejoin former check instance. */ + /* We do NOT support leader (and its associated engine) to rejoin former check instance. 
*/ if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) goto out_log; @@ -3439,22 +3418,10 @@ chk_engine_rejoin(void *args) if (rc != 0) goto out_tree; - uuid_unparse_lower(cbk->cb_iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); - if (rc != 0) - goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, cbk->cb_iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, prop->cp_leader, ins->ci_iv_ns->iv_master_term + 1); - again: /* Ask leader whether this engine can rejoin or not. */ rc = chk_rejoin_remote(prop->cp_leader, cbk->cb_gen, myrank, cbk->cb_iv_uuid, &flags, - &pool_nr, &pools); + &ins->ci_ns_ver, &pool_nr, &pools, &ranks); if (rc != 0) { if ((rc == -DER_OOG || rc == -DER_GRPVER) && !ins->ci_pause) { D_INFO(DF_ENGINE" Someone is not ready %d, let's rejoin after 1 sec\n", @@ -3464,14 +3431,22 @@ chk_engine_rejoin(void *args) goto again; } - goto out_iv; + goto out_tree; } - if (pool_nr == 0) { + if (ranks == NULL || pool_nr == 0) { need_join = false; - D_GOTO(out_iv, rc = 1); + D_GOTO(out_tree, rc = 1); } + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = ranks; + ranks = NULL; + + rc = chk_iv_ns_create(ins, cbk->cb_iv_uuid, prop->cp_leader, ins->ci_ns_ver); + if (rc != 0) + goto out_tree; + rc = chk_pools_load_list(ins, cbk->cb_gen, 0, pool_nr, pools, NULL); if (rc != 0) goto out_notify; @@ -3515,17 +3490,13 @@ chk_engine_rejoin(void *args) D_CDEBUG(rc1 != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc1 = %d\n", DP_ENGINE(ins), myrank, cbk->cb_ins_status, rc1); -out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); out_log: + d_rank_list_free(ranks); + 
D_FREE(pools); if (need_join) D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" rejoin on rank %u with iv "DF_UUIDF": "DF_RC"\n", diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 6c1d5508260..57fe52d9d5a 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -76,6 +76,7 @@ struct chk_pool_mbs { uint32_t *cpm_tgt_status; }; +/* clang-format off */ /* * CHK_START: * From check leader to check engine to start the check instance on specified pool(s) or all pools. @@ -83,7 +84,7 @@ struct chk_pool_mbs { #define DAOS_ISEQ_CHK_START \ ((uint64_t) (csi_gen) CRT_VAR) \ ((uint32_t) (csi_flags) CRT_VAR) \ - ((int32_t) (csi_phase) CRT_VAR) \ + ((int32_t) (csi_ns_ver) CRT_VAR) \ ((d_rank_t) (csi_leader_rank) CRT_VAR) \ ((uint32_t) (csi_api_flags) CRT_VAR) \ ((uuid_t) (csi_iv_uuid) CRT_VAR) \ @@ -272,11 +273,13 @@ CRT_RPC_DECLARE(chk_report, DAOS_ISEQ_CHK_REPORT, DAOS_OSEQ_CHK_REPORT); #define DAOS_OSEQ_CHK_REJOIN \ ((int32_t) (cro_status) CRT_VAR) \ ((uint32_t) (cro_flags) CRT_VAR) \ + ((uint32_t) (cro_ns_ver) CRT_VAR) \ + ((uint32_t) (cro_padding) CRT_VAR) \ + ((d_rank_t) (cro_ranks) CRT_ARRAY) \ ((uuid_t) (cro_pools) CRT_ARRAY) CRT_RPC_DECLARE(chk_rejoin, DAOS_ISEQ_CHK_REJOIN, DAOS_OSEQ_CHK_REJOIN); -/* clang-format off */ /* * CHK_SET_POLICY: * From check leader to check engine to set policy during check instance running. @@ -501,16 +504,12 @@ struct chk_bookmark { * 'reset' for all pools. */ struct chk_property { - d_rank_t cp_leader; - Chk__CheckFlag cp_flags; - Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; - /* - * NOTE: Preserve for supporting to continue the check until the specified phase in the - * future. -1 means to check all phases. - */ - int32_t cp_phase; + d_rank_t cp_leader; + Chk__CheckFlag cp_flags; + Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; + uint32_t cp_padding; /* How many ranks (ever or should) take part in the check instance. 
*/ - uint32_t cp_rank_nr; + uint32_t cp_rank_nr; }; /* @@ -576,6 +575,7 @@ struct chk_instance { ci_rejoining:1, ci_implicated:1; uint32_t ci_start_flags; + uint32_t ci_ns_ver; }; struct chk_iv { @@ -762,9 +762,8 @@ void chk_pending_destroy(struct chk_pending_rec *cpr); int chk_policy_refresh(uint32_t policy_nr, struct chk_policy *policies, struct chk_property *prop); -int chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, - d_rank_list_t *ranks, struct chk_property *prop); +int chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, + struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop); uint32_t chk_pool_merge_status(uint32_t status_a, uint32_t status_b); @@ -781,7 +780,7 @@ void chk_ins_fini(struct chk_instance **p_ins); int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues); int chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags); @@ -818,6 +817,12 @@ void chk_engine_fini(void); /* chk_iv.c */ +void chk_iv_ns_destroy(struct chk_instance *ins); + +int chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver); + +int chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver); + int chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry); int chk_iv_init(void); @@ -834,8 +839,8 @@ int chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) int chk_leader_notify(struct chk_iv *iv); -int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools); +int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, + 
uint32_t *ns_ver, int *pool_nr, uuid_t **pools, d_rank_list_t **ranks); int chk_leader_setup(void); @@ -849,8 +854,8 @@ void chk_leader_fini(void); int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, - uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, + uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); int chk_stop_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t pools[], chk_co_rpc_cb_t stop_cb, void *args); @@ -879,7 +884,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, uint32_t detail_nr, d_sg_list_t *details, uint64_t seq); int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t *pool_nr, uuid_t **pools); + uint32_t *ns_ver, uint32_t *pool_nr, uuid_t **pools, d_rank_list_t **ranks); int chk_set_policy_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t policy_nr, struct chk_policy *policies); @@ -1032,17 +1037,6 @@ chk_query_free(struct chk_query_pool_shard *shards, uint32_t shard_nr) } } -static inline void -chk_iv_ns_cleanup(struct ds_iv_ns **ns) -{ - if (*ns != NULL) { - if ((*ns)->iv_refcount == 1) - ds_iv_ns_cleanup(*ns); - ds_iv_ns_put(*ns); - *ns = NULL; - } -} - static inline void chk_pool_get(struct chk_pool_rec *cpr) { @@ -1249,7 +1243,7 @@ chk_ins_can_start(struct chk_instance *ins) if (ins->ci_starting) return -DER_INPROGRESS; - if (ins->ci_stopping || ins->ci_sched_exiting) + if (ins->ci_stopping || ins->ci_sched_exiting || ins->ci_rejoining) return -DER_BUSY; if (ins->ci_sched_running) diff --git a/src/chk/chk_iv.c b/src/chk/chk_iv.c index 299c1554856..a67ae24864c 100644 --- a/src/chk/chk_iv.c +++ b/src/chk/chk_iv.c @@ -1,5 +1,6 @@ /** * (C) Copyright 
2022-2024 Intel Corporation.
+ * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -181,6 +182,88 @@ struct ds_iv_class_ops chk_iv_ops = {
 	.ivc_value_alloc	= chk_iv_value_alloc,
 };
 
+/*
+ * Tear down the IV namespace and the IV secondary group of @ins, if they exist.
+ * Both pointers are reset to NULL afterwards, so calling it again is harmless.
+ */
+void
+chk_iv_ns_destroy(struct chk_instance *ins)
+{
+	if (ins->ci_iv_ns != NULL) {
+		if (ins->ci_iv_ns->iv_refcount == 1)
+			ds_iv_ns_cleanup(ins->ci_iv_ns);
+		ds_iv_ns_put(ins->ci_iv_ns);
+		ins->ci_iv_ns = NULL;
+	}
+
+	if (ins->ci_iv_group != NULL) {
+		crt_group_secondary_destroy(ins->ci_iv_group);
+		ins->ci_iv_group = NULL;
+	}
+}
+
+/*
+ * Create the IV secondary group (named after @uuid) and the IV namespace for @ins, install
+ * ins->ci_ranks as the group membership at version @ns_ver, then point the namespace at @leader.
+ *
+ * Return 0 on success. On failure, whatever has been created here is destroyed again via
+ * chk_iv_ns_destroy() and the non-zero error code of the failed step is returned.
+ */
+int
+chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver)
+{
+	char	uuid_str[DAOS_UUID_STR_SIZE];
+	int	rc;
+
+	uuid_unparse_lower(uuid, uuid_str);
+	/* No rank list is passed here; chk_iv_ns_update() below installs ins->ci_ranks. */
+	rc = crt_group_secondary_create(uuid_str, NULL, NULL, &ins->ci_iv_group);
+	if (rc != 0)
+		goto out;
+
+	rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, uuid, ins->ci_iv_group, &ins->ci_iv_id,
+			     &ins->ci_iv_ns);
+	if (rc != 0)
+		goto out;
+
+	rc = chk_iv_ns_update(ins, ns_ver);
+	if (rc == 0)
+		ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1);
+
+out:
+	if (rc != 0)
+		chk_iv_ns_destroy(ins);
+	return rc;
+}
+
+/*
+ * Replace the membership of the IV secondary group with ins->ci_ranks at version @ns_ver and,
+ * on success, remember @ns_ver in ins->ci_ns_ver.
+ *
+ * If the instance currently has no IV secondary group, there is no membership to replace and
+ * only the version is recorded. The group modification that used to be open-coded in
+ * chk_engine_mark_rank_dead() was skipped in the same way when ci_iv_group was NULL, so that
+ * caller must not start failing once it goes through this helper.
+ *
+ * Return 0 on success, otherwise the error code from crt_group_secondary_modify().
+ */
+int
+chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver)
+{
+	int rc = 0;
+
+	if (ins->ci_iv_group != NULL) {
+		/* Let secondary rank == primary rank. */
+		rc = crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks,
+						CRT_GROUP_MOD_OP_REPLACE, ns_ver);
+	}
+
+	if (rc == 0)
+		ins->ci_ns_ver = ns_ver;
+
+	return rc;
+}
+
 int
 chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry)
 {
diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c
index 3f9d54b0d25..49087a269a4 100644
--- a/src/chk/chk_leader.c
+++ b/src/chk/chk_leader.c
@@ -2128,7 +2128,7 @@ chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr)
 	struct chk_pool_shard	*tmp;
 	struct chk_property	*prop = &ins->ci_prop;
 	struct chk_bookmark	*cbk = &ins->ci_bk;
-	uint32_t		 version = cbk->cb_gen - prop->cp_rank_nr - 1;
+	uint32_t		 version = ins->ci_ns_ver + 1;
 	int			 rc = 0;
 
 	if (!chk_remove_rank_from_list(ins->ci_ranks, cdr->cdr_rank))
@@ -2139,8 +2139,7 @@ chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr)
 	if (rc != 0)
 		goto out;
 
-	rc = crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks,
-					CRT_GROUP_MOD_OP_REPLACE, version);
+	rc = chk_iv_ns_update(ins, version);
 	if (rc != 0)
 		goto out;
 
@@ -2461,8 +2460,8 @@ chk_leader_ranks_prepare(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *r
 
 static int
 chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks,
-		      uint32_t policy_nr, struct chk_policy *policies, int pool_nr,
-		      uuid_t pools[], int phase, d_rank_t leader, uint32_t flags)
+		      uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[],
+		      d_rank_t leader, uint32_t flags)
 {
 	struct chk_property	*prop = &ins->ci_prop;
 	struct chk_bookmark	*cbk = &ins->ci_bk;
@@ -2552,7 +2551,7 @@ chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank
 	cbk->cb_version = chk_ver;
 
 init:
-	rc = chk_prop_prepare(leader, flags, phase, policy_nr, policies, rank_list, prop);
+	rc = chk_prop_prepare(leader, flags, policy_nr, policies, rank_list, prop);
 	if (rc != 0)
 		goto out;
 
@@ -2879,20 +2878,20 @@
chk_leader_start_cb(struct chk_co_rpc_cb_args *cb_args) int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, - int pool_nr, uuid_t pools[], uint32_t api_flags, int phase) + int pool_nr, uuid_t pools[], uint32_t api_flags) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *c_pools = NULL; - struct umem_attr uma = { 0 }; - uuid_t dummy_pool = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - uint64_t old_gen = cbk->cb_gen; - d_rank_t myrank = dss_self_rank(); - uint32_t flags = api_flags; - int c_pool_nr = 0; - int rc; - int rc1; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + uuid_t *c_pools = NULL; + struct umem_attr uma = {0}; + uuid_t dummy_pool = {0}; + uint64_t old_gen = cbk->cb_gen; + d_rank_t myrank = dss_self_rank(); + uint32_t flags = api_flags; + uint32_t ns_ver = (uint32_t)daos_wallclock_secs(); + int c_pool_nr = 0; + int rc; + int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2917,13 +2916,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } - + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; rc = dbtree_create_inplace(DBTREE_CLASS_CHK_RANK, 0, CHK_BTREE_ORDER, &uma, @@ -2942,8 +2935,8 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_tree; reset: - rc = chk_leader_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, - phase, myrank, flags); + rc = chk_leader_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, myrank, + flags); if (rc == 1 && !(flags & CHK__CHECK_FLAG__CF_RESET)) { /* Former check instance has done, let's re-start from the beginning. 
*/ flags |= CHK__CHECK_FLAG__CF_RESET; @@ -2957,18 +2950,10 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto remote; uuid_generate(dummy_pool); - uuid_unparse_lower(dummy_pool, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, dummy_pool, myrank, ns_ver); if (rc != 0) goto out_tree; - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, dummy_pool, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, myrank, ins->ci_iv_ns->iv_master_term + 1); - if (d_list_empty(&ins->ci_pool_list)) { c_pool_nr = pool_nr; c_pools = pools; @@ -2980,7 +2965,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c remote: rc = chk_start_remote(ins->ci_ranks, cbk->cb_gen, rank_nr, ranks, policy_nr, policies, - c_pool_nr, c_pools, flags, phase, myrank, ins->ci_start_flags, + c_pool_nr, c_pools, flags, ns_ver, myrank, ins->ci_start_flags, dummy_pool, chk_leader_start_cb, ins); if (rc != 0) { if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { @@ -3023,10 +3008,9 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_stop_pools; } - D_INFO("Leader %s check with api_flags %x, phase %d, leader %u, flags %x, gen " DF_X64 - " iv "DF_UUIDF": rc %d\n", - chk_is_ins_reset(ins, flags) ? "start" : "resume", api_flags, phase, myrank, - ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool), rc); + D_INFO("Leader %s with api_flags %x, leader %u, flags %x, gen " DF_X64 " iv " DF_UUIDF "\n", + chk_is_ins_reset(ins, flags) ? "start" : "resume", api_flags, myrank, + ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool)); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, c_pool_nr > 0 ? 
c_pool_nr : pool_nr, @@ -3049,8 +3033,6 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to rollback failed check start: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING || cbk->cb_gen != old_gen) { cbk->cb_gen = old_gen; if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING) { @@ -3062,17 +3044,16 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to update leader bookmark: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); } - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; + chk_iv_ns_destroy(ins); out_tree: chk_leader_destroy_trees(ins); ins->ci_starting = 0; out_log: - D_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, - "Leader %s to start check on %u ranks for %d pools with " - "api_flags %x, phase %d, leader %u, gen "DF_X64": rc = %d\n", - rc < 0 ? "failed" : "try", rank_nr, pool_nr, api_flags, phase, - myrank, cbk->cb_gen, rc); + DL_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, rc, + "Leader %s to start check on %u ranks for %d pools with api_flags %x, ns_ver %d, " + "leader %u, gen " DF_X64, + rc < 0 ? 
"failed" : "try", rank_nr, pool_nr, api_flags, ns_ver, myrank, + cbk->cb_gen); if (unlikely(rc > 0)) rc = 0; @@ -3826,8 +3807,8 @@ chk_leader_notify(struct chk_iv *iv) } int -chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools) +chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, uint32_t *ns_ver, + int *pool_nr, uuid_t **pools, d_rank_list_t **ranks) { struct chk_instance *ins = chk_leader; struct chk_bookmark *cbk = &ins->ci_bk; @@ -3854,7 +3835,9 @@ chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, if (ins->ci_orphan_done) *flags = CRF_ORPHAN_DONE; - rc = chk_leader_pools2list(ins, pool_nr, pools); + *ns_ver = ins->ci_ns_ver; + *ranks = ins->ci_ranks; + rc = chk_leader_pools2list(ins, pool_nr, pools); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index d81506e5c35..e250936dfc2 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -527,8 +527,8 @@ chk_sg_rpc_prepare(d_rank_t rank, crt_opcode_t opc, crt_rpc_t **req) int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args) { struct chk_co_rpc_cb_args cb_args = { 0 }; @@ -544,12 +544,12 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran if (rc != 0) goto out; - csi = crt_req_get(req); - csi->csi_gen = gen; - csi->csi_flags = flags; - csi->csi_phase = phase; + csi = crt_req_get(req); + csi->csi_gen = gen; + csi->csi_flags = flags; + csi->csi_ns_ver = ns_ver; csi->csi_leader_rank = leader; - csi->csi_api_flags = api_flags; + csi->csi_api_flags = api_flags; uuid_copy(csi->csi_iv_uuid, iv_uuid); csi->csi_ranks.ca_count = rank_nr; csi->csi_ranks.ca_arrays = ranks; @@ -605,9 +605,9 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran crt_req_decref(req); } - D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, - "Rank %u start checker, gen "DF_X64", flags %x, phase %d, iv "DF_UUIDF":"DF_RC"\n", - leader, gen, flags, phase, DP_UUID(iv_uuid), DP_RC(rc)); + DL_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, rc, + "Rank %u start checker, gen " DF_X64 ", flags %x, ns_ver %d, iv " DF_UUIDF, + leader, gen, flags, ns_ver, DP_UUID(iv_uuid)); return rc; } @@ -1019,7 +1019,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t 
*flags, - uint32_t *pool_nr, uuid_t **pools) + uint32_t *ns_ver, uint32_t *pool_nr, uuid_t **pools, d_rank_list_t **ranks) { crt_rpc_t *req = NULL; struct chk_rejoin_in *cri; @@ -1042,8 +1042,22 @@ chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, cro = crt_reply_get(req); rc = cro->cro_status; - if (rc == 0 && cro->cro_pools.ca_count > 0) { - *flags = cro->cro_flags; + if (rc != 0) + goto out; + + *flags = cro->cro_flags; + *ns_ver = cro->cro_ns_ver; + + if (cro->cro_ranks.ca_count > 0) { + *ranks = d_rank_list_alloc(cro->cro_ranks.ca_count); + if (*ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy((*ranks)->rl_ranks, cro->cro_ranks.ca_arrays, + sizeof(d_rank_t) * cro->cro_ranks.ca_count); + } + + if (cro->cro_pools.ca_count > 0) { D_ALLOC(tmp, cro->cro_pools.ca_count); if (tmp == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/chk/chk_srv.c b/src/chk/chk_srv.c index 84d6f3a21bc..d50e3b59657 100644 --- a/src/chk/chk_srv.c +++ b/src/chk/chk_srv.c @@ -27,7 +27,7 @@ ds_chk_start_hdlr(crt_rpc_t *rpc) rc = chk_engine_start(csi->csi_gen, csi->csi_ranks.ca_count, csi->csi_ranks.ca_arrays, csi->csi_policies.ca_count, csi->csi_policies.ca_arrays, csi->csi_uuids.ca_count, csi->csi_uuids.ca_arrays, csi->csi_api_flags, - csi->csi_phase, csi->csi_leader_rank, csi->csi_flags, + csi->csi_ns_ver, csi->csi_leader_rank, csi->csi_flags, csi->csi_iv_uuid, &clues); if (rc > 0) { D_ALLOC_PTR(rank); @@ -249,18 +249,21 @@ ds_chk_report_hdlr(crt_rpc_t *rpc) static void ds_chk_rejoin_hdlr(crt_rpc_t *rpc) { - struct chk_rejoin_in *cri = crt_req_get(rpc); - struct chk_rejoin_out *cro = crt_reply_get(rpc); - uuid_t *pools = NULL; - int pool_nr = 0; - int rc; + struct chk_rejoin_in *cri = crt_req_get(rpc); + struct chk_rejoin_out *cro = crt_reply_get(rpc); + uuid_t *pools = NULL; + d_rank_list_t *ranks = NULL; + int pool_nr = 0; + int rc; rc = chk_leader_rejoin(cri->cri_gen, cri->cri_rank, cri->cri_iv_uuid, &cro->cro_flags, - &pool_nr, &pools); + 
&cro->cro_ns_ver, &pool_nr, &pools, &ranks); cro->cro_status = rc; if (rc == 0) { - cro->cro_pools.ca_count = pool_nr; + cro->cro_ranks.ca_count = ranks->rl_nr; + cro->cro_ranks.ca_arrays = ranks->rl_ranks; + cro->cro_pools.ca_count = pool_nr; cro->cro_pools.ca_arrays = pools; } diff --git a/src/include/daos_srv/daos_chk.h b/src/include/daos_srv/daos_chk.h index 5756c84232e..9c363c86c9c 100644 --- a/src/include/daos_srv/daos_chk.h +++ b/src/include/daos_srv/daos_chk.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -74,9 +74,9 @@ typedef int (*chk_query_pool_cb_t)(struct chk_query_pool_shard *shard, uint32_t typedef int (*chk_prop_cb_t)(void *buf, uint32_t policies[], int cnt, uint32_t flags); -int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, - struct chk_policy *policies, int pool_nr, uuid_t pools[], - uint32_t api_flags, int phase); +int +chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, + int pool_nr, uuid_t pools[], uint32_t api_flags); int chk_leader_stop(int pool_nr, uuid_t pools[]); diff --git a/src/mgmt/srv_chk.c b/src/mgmt/srv_chk.c index 705f4f0609e..3dd937d9bff 100644 --- a/src/mgmt/srv_chk.c +++ b/src/mgmt/srv_chk.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2022 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -48,7 +48,7 @@ ds_mgmt_chk_parse_uuid(int pool_nr, char **pools, uuid_t **p_uuids) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int32_t pool_nr, char **pools, - uint32_t flags, int32_t phase) + uint32_t flags) { uuid_t *uuids = NULL; struct chk_policy *ply = NULL; @@ -70,7 +70,7 @@ ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, } } - rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags, phase); + rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags); out: D_FREE(uuids); diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index 47202fce0a2..2fedf826efd 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2789,7 +2789,7 @@ ds_mgmt_drpc_check_start(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_INFO("Received request to start check\n"); rc = ds_mgmt_check_start(req->n_ranks, req->ranks, req->n_policies, req->policies, - req->n_uuids, req->uuids, req->flags, -1 /* phase */); + req->n_uuids, req->uuids, req->flags); if (rc < 0) D_ERROR("Failed to start check: "DF_RC"\n", DP_RC(rc)); diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 511cae1b7c4..5c18c47817c 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -139,9 +139,10 @@ int const char *user, const char *group); /** srv_chk.c */ -int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, +int + ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase); + uint32_t flags); int ds_mgmt_check_stop(int pool_nr, char **pools); int ds_mgmt_check_query(int pool_nr, char **pools, chk_query_head_cb_t head_cb, chk_query_pool_cb_t pool_cb, void *buf); diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 9d93e8697e7..382616db41f 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -694,7 +694,7 @@ mock_ds_mgmt_dev_set_faulty_setup(void) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase) + uint32_t flags) { return 0; } diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 9821740cf55..774f0e9122a 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -3848,6 +3848,61 @@ cr_maintenance_mode(void **state) cr_cleanup(arg, &pool, 1); } +/* + * 1. Exclude rank 0. + * 2. Create pool without inconsistency. + * 3. Start checker without options. + * 4. Query checker, it should be completed instead of being blocked. + * 5. Switch to normal mode and cleanup. 
+ */ +static void +cr_lost_rank0(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct daos_check_info dci = {0}; + int rc; + + print_message("CR29: CR with rank 0 excluded at the beginning\n"); + + print_message("CR: excluding the rank 0 ...\n"); + rc = dmg_system_exclude_rank(dmg_config_file, 0); + assert_rc_equal(rc, 0); + + rc = cr_pool_create(state, &pool, false, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_RESET, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + /* Reint the rank for subsequent test. */ + rc = cr_rank_reint(0, true); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + /* clang-format off */ static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", @@ -3906,6 +3961,8 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool2, async_disable, test_case_teardown}, { "CR28: maintenance mode after dry-run check", cr_maintenance_mode, async_disable, test_case_teardown}, + { "CR29: CR with rank 0 excluded at the beginning", + cr_lost_rank0, async_disable, test_case_teardown}, }; /* clang-format on */