Skip to content

Commit 0857dde

Browse files
committed
DAOS-17535 chk: misc improvements for CR logic - b26
Includes the following: 1. When creating the CHK IV namespace, make the secondary group the same as the primary group. Otherwise, the CHK logic may hit DER_NONEXIST errors when communicating via IV. 2. Integrate the CHK IV namespace create and destroy APIs, clean up the related logic, and redefine the version. 3. Get the rank list and IV namespace version from the CHK leader on rejoin. Adjust the CHK_REJOIN RPC for the related changes. 4. Remove the unsupported functionality for checking a specified 'phase'. 5. Add a new test for the case where some engine(s) are lost before the checker starts. 6. Use a dedicated ULT to handle dead rank events, so that handling is not affected by checker start or stop. Then, even if the check scheduler has exited, a subsequent check query can still work against the latest rank list. Test-tag: recovery Signed-off-by: Fan Yong <fan.yong@hpe.com>
1 parent 45022ee commit 0857dde

23 files changed

+560
-411
lines changed

src/chk/chk_common.c

Lines changed: 8 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1019,8 +1019,7 @@ chk_pending_destroy(struct chk_pending_rec *cpr)
10191019
}
10201020

10211021
int
1022-
chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase,
1023-
uint32_t policy_nr, struct chk_policy *policies,
1022+
chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies,
10241023
d_rank_list_t *ranks, struct chk_property *prop)
10251024
{
10261025
int rc = 0;
@@ -1033,11 +1032,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase,
10331032
prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT;
10341033
if (flags & CHK__CHECK_FLAG__CF_NO_AUTO)
10351034
prop->cp_flags &= ~CHK__CHECK_FLAG__CF_AUTO;
1036-
prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET |
1037-
CHK__CHECK_FLAG__CF_ORPHAN_POOL |
1038-
CHK__CHECK_FLAG__CF_NO_FAILOUT |
1039-
CHK__CHECK_FLAG__CF_NO_AUTO);
1040-
prop->cp_phase = phase;
1035+
prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | CHK__CHECK_FLAG__CF_ORPHAN_POOL |
1036+
CHK__CHECK_FLAG__CF_NO_FAILOUT | CHK__CHECK_FLAG__CF_NO_AUTO);
10411037
if (ranks != NULL)
10421038
prop->cp_rank_nr = ranks->rl_nr;
10431039

@@ -1195,12 +1191,7 @@ chk_ins_cleanup(struct chk_instance *ins)
11951191
chk_stop_sched(ins);
11961192
ins->ci_inited = 0;
11971193

1198-
chk_iv_ns_cleanup(&ins->ci_iv_ns);
1199-
1200-
if (ins->ci_iv_group != NULL) {
1201-
crt_group_secondary_destroy(ins->ci_iv_group);
1202-
ins->ci_iv_group = NULL;
1203-
}
1194+
chk_iv_ns_destroy(ins);
12041195
}
12051196

12061197
int
@@ -1215,7 +1206,8 @@ chk_ins_init(struct chk_instance **p_ins)
12151206
if (ins == NULL)
12161207
D_GOTO(out_init, rc = -DER_NOMEM);
12171208

1218-
ins->ci_sched = ABT_THREAD_NULL;
1209+
ins->ci_sched = ABT_THREAD_NULL;
1210+
ins->ci_dead_rank_ult = ABT_THREAD_NULL;
12191211

12201212
ins->ci_rank_hdl = DAOS_HDL_INVAL;
12211213
D_INIT_LIST_HEAD(&ins->ci_rank_list);
@@ -1281,6 +1273,8 @@ chk_ins_fini(struct chk_instance **p_ins)
12811273
D_ASSERT(daos_handle_is_inval(ins->ci_pending_hdl));
12821274
D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list));
12831275

1276+
D_ASSERT(ins->ci_dead_rank_ult == ABT_THREAD_NULL);
1277+
12841278
if (ins->ci_sched != ABT_THREAD_NULL)
12851279
ABT_thread_free(&ins->ci_sched);
12861280

0 commit comments

Comments
 (0)