Skip to content

Commit e5f8fe9

Browse files
committed
DAOS-17535 chk: misc improvements for CR logic
This commit includes the following: 1. When creating the CHK IV namespace, make the secondary group the same as the primary group. Otherwise, the CHK logic may hit DER_NONEXIST errors when communicating via IV. 2. Integrate the CHK IV namespace create and destroy APIs, clean up the related logic, and redefine the version. 3. Get the rank list and the IV namespace version from the CHK leader when rejoining. Adjust the CHK_REJOIN RPC for the related changes. 4. Remove the unsupported functionality for checking a specified 'phase'. 5. Add a new test for the case where some engine(s) are lost before the checker is started. 6. Use a dedicated ULT to handle dead rank events, so that the handling is not affected by checker start or stop. Then, even if the check scheduler has exited, a subsequent check query can still work against the latest rank list. Signed-off-by: Fan Yong <[email protected]>
1 parent a78d363 commit e5f8fe9

24 files changed

+530
-397
lines changed

src/chk/chk_common.c

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,8 +1073,7 @@ chk_policy_refresh(uint32_t policy_nr, struct chk_policy *policies, struct chk_p
10731073
}
10741074

10751075
int
1076-
chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase,
1077-
uint32_t policy_nr, struct chk_policy *policies,
1076+
chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies,
10781077
d_rank_list_t *ranks, struct chk_property *prop)
10791078
{
10801079
int rc = 0;
@@ -1086,11 +1085,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase,
10861085
prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT;
10871086
if (flags & CHK__CHECK_FLAG__CF_NO_AUTO)
10881087
prop->cp_flags &= ~CHK__CHECK_FLAG__CF_AUTO;
1089-
prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET |
1090-
CHK__CHECK_FLAG__CF_ORPHAN_POOL |
1091-
CHK__CHECK_FLAG__CF_NO_FAILOUT |
1092-
CHK__CHECK_FLAG__CF_NO_AUTO);
1093-
prop->cp_phase = phase;
1088+
prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | CHK__CHECK_FLAG__CF_ORPHAN_POOL |
1089+
CHK__CHECK_FLAG__CF_NO_FAILOUT | CHK__CHECK_FLAG__CF_NO_AUTO);
10941090
if (ranks != NULL)
10951091
prop->cp_rank_nr = ranks->rl_nr;
10961092

@@ -1240,12 +1236,7 @@ chk_ins_cleanup(struct chk_instance *ins)
12401236
chk_stop_sched(ins);
12411237
ins->ci_inited = 0;
12421238

1243-
chk_iv_ns_cleanup(&ins->ci_iv_ns);
1244-
1245-
if (ins->ci_iv_group != NULL) {
1246-
crt_group_secondary_destroy(ins->ci_iv_group);
1247-
ins->ci_iv_group = NULL;
1248-
}
1239+
chk_iv_ns_destroy(ins);
12491240
}
12501241

12511242
int
@@ -1260,7 +1251,8 @@ chk_ins_init(struct chk_instance **p_ins)
12601251
if (ins == NULL)
12611252
D_GOTO(out_init, rc = -DER_NOMEM);
12621253

1263-
ins->ci_sched = ABT_THREAD_NULL;
1254+
ins->ci_sched = ABT_THREAD_NULL;
1255+
ins->ci_dead_rank_ult = ABT_THREAD_NULL;
12641256

12651257
ins->ci_rank_hdl = DAOS_HDL_INVAL;
12661258
D_INIT_LIST_HEAD(&ins->ci_rank_list);
@@ -1332,6 +1324,8 @@ chk_ins_fini(struct chk_instance **p_ins)
13321324
D_ASSERT(d_list_empty(&ins->ci_interaction_filter_list));
13331325
D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list));
13341326

1327+
D_ASSERT(ins->ci_dead_rank_ult == ABT_THREAD_NULL);
1328+
13351329
if (ins->ci_sched != ABT_THREAD_NULL)
13361330
ABT_thread_free(&ins->ci_sched);
13371331

0 commit comments

Comments
 (0)