|
1 | 1 | /* |
2 | 2 | * (C) Copyright 2019-2024 Intel Corporation. |
| 3 | + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP |
3 | 4 | * |
4 | 5 | * SPDX-License-Identifier: BSD-2-Clause-Patent |
5 | 6 | */ |
@@ -179,6 +180,12 @@ crt_swim_membs_iterate(struct crt_swim_membs *csm, d_hash_traverse_cb_t cb, void |
179 | 180 | return d_hash_table_traverse(csm->csm_table, cb, arg); |
180 | 181 | } |
181 | 182 |
|
/*
 * Return true when the given status is SWIM_MEMBER_ALIVE or
 * SWIM_MEMBER_SUSPECT, and false for every other status value. These are the
 * two states that the csm_alive_or_suspect_count bookkeeping in this file
 * counts.
 */
| 183 | +static inline bool |
| 184 | +crt_swim_status_alive_or_suspect(enum swim_member_status status) |
| 185 | +{ |
| 186 | + return status == SWIM_MEMBER_ALIVE || status == SWIM_MEMBER_SUSPECT; |
| 187 | +} |
| 188 | + |
182 | 189 | /* Move cst into the csm. */ |
183 | 190 | static int |
184 | 191 | crt_swim_membs_add(struct crt_swim_membs *csm, struct crt_swim_target *cst) |
@@ -214,6 +221,9 @@ crt_swim_membs_add(struct crt_swim_membs *csm, struct crt_swim_target *cst) |
214 | 221 | if (csm->csm_target == CRT_SWIM_TARGET_INVALID) |
215 | 222 | csm->csm_target = 0; |
216 | 223 |
|
| 224 | + if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status)) |
| 225 | + csm->csm_alive_or_suspect_count++; |
| 226 | + |
217 | 227 | return 0; |
218 | 228 | } |
219 | 229 |
|
@@ -256,6 +266,9 @@ crt_swim_membs_del(struct crt_swim_membs *csm, d_rank_t rank) |
256 | 266 | deleted = d_hash_rec_delete_at(csm->csm_table, &cst->cst_link); |
257 | 267 | D_ASSERT(deleted); |
258 | 268 |
|
| 269 | + if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status)) |
| 270 | + csm->csm_alive_or_suspect_count--; |
| 271 | + |
259 | 272 | return cst; |
260 | 273 | } |
261 | 274 |
|
@@ -952,12 +965,12 @@ static int crt_swim_set_member_state(struct swim_context *ctx, |
952 | 965 | crt_swim_csm_lock(csm); |
953 | 966 | cst = crt_swim_membs_find(csm, id); |
954 | 967 | if (cst != NULL && state->sms_incarnation >= cst->cst_state.sms_incarnation) { |
955 | | - if (cst->cst_state.sms_status != SWIM_MEMBER_ALIVE && |
956 | | - state->sms_status == SWIM_MEMBER_ALIVE) |
957 | | - csm->csm_alive_count++; |
958 | | - else if (cst->cst_state.sms_status == SWIM_MEMBER_ALIVE && |
959 | | - state->sms_status != SWIM_MEMBER_ALIVE) |
960 | | - csm->csm_alive_count--; |
| 968 | + if (!crt_swim_status_alive_or_suspect(cst->cst_state.sms_status) && |
| 969 | + crt_swim_status_alive_or_suspect(state->sms_status)) |
| 970 | + csm->csm_alive_or_suspect_count++; |
| 971 | + else if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status) && |
| 972 | + !crt_swim_status_alive_or_suspect(state->sms_status)) |
| 973 | + csm->csm_alive_or_suspect_count--; |
961 | 974 | state_prev = cst->cst_state; |
962 | 975 | cst->cst_state = *state; |
963 | 976 | rc = 0; |
@@ -1057,7 +1070,7 @@ static int64_t crt_swim_progress_cb(crt_context_t crt_ctx, int64_t timeout_us, v |
1057 | 1070 | * The max_delay should be less than the suspicion timeout to guarantee |
1058 | 1071 | * that already suspected members will not be expired. |
1059 | 1072 | */ |
1060 | | - if (csm->csm_alive_count > 2) { |
| 1073 | + if (csm->csm_alive_or_suspect_count > 2) { |
1061 | 1074 | uint64_t hlc1 = csm->csm_last_unpack_hlc; |
1062 | 1075 | uint64_t hlc2 = d_hlc_get(); |
1063 | 1076 | uint64_t delay = d_hlc2msec(hlc2 - hlc1); |
@@ -1150,7 +1163,7 @@ int crt_swim_init(int crt_ctx_idx) |
1150 | 1163 |
|
1151 | 1164 | csm->csm_crt_ctx_idx = crt_ctx_idx; |
1152 | 1165 | csm->csm_last_unpack_hlc = hlc; |
1153 | | - csm->csm_alive_count = 0; |
| 1166 | + csm->csm_alive_or_suspect_count = 0; |
1154 | 1167 | csm->csm_nglitches = 0; |
1155 | 1168 | csm->csm_nmessages = 0; |
1156 | 1169 | /* |
@@ -1350,31 +1363,42 @@ void crt_swim_disable_all(void) |
1350 | 1363 | old_ctx_idx, NULL); |
1351 | 1364 | } |
1352 | 1365 |
|
/*
 * Argument bundle handed to crt_swim_suspend_cb() through
 * crt_swim_membs_iterate() by crt_swim_suspend_all().
 */
| 1366 | +struct crt_swim_suspend_arg { |
/* Membership structure whose csm_alive_or_suspect_count the callback adjusts. */
| 1367 | + struct crt_swim_membs *csm; |
/* ID of the member the callback leaves untouched (set from swim_self_get()). */
| 1368 | + swim_id_t self_id; |
| 1369 | +}; |
| 1370 | + |
/*
 * Per-member callback used with crt_swim_membs_iterate().
 *
 * link: hash-table link of the member being visited; converted to its
 *       struct crt_swim_target with crt_swim_target_obj().
 * varg: pointer to a struct crt_swim_suspend_arg.
 *
 * Every member whose cst_id differs from arg->self_id is marked
 * SWIM_MEMBER_INACTIVE; the member matching self_id is not modified.
 * Always returns 0.
 *
 * NOTE(review): rows marked '-' are the superseded lines of this diff, rows
 * marked '+' are their replacements.
 */
1353 | 1371 | static int |
1354 | | -crt_swim_suspend_cb(d_list_t *link, void *arg) |
| 1372 | +crt_swim_suspend_cb(d_list_t *link, void *varg) |
1355 | 1373 | { |
1356 | | - struct crt_swim_target *cst = crt_swim_target_obj(link); |
1357 | | - swim_id_t *self_id = arg; |
| 1374 | + struct crt_swim_target *cst = crt_swim_target_obj(link); |
| 1375 | + struct crt_swim_suspend_arg *arg = varg; |
1358 | 1376 |

1359 | | - if (cst->cst_id != *self_id) |
| 1377 | + if (cst->cst_id != arg->self_id) { |
/*
 * The counter is adjusted before the status is overwritten below, because the
 * test must see the member's previous status.
 */
| 1378 | + if (crt_swim_status_alive_or_suspect(cst->cst_state.sms_status)) |
| 1379 | + arg->csm->csm_alive_or_suspect_count--; |
1360 | 1380 | cst->cst_state.sms_status = SWIM_MEMBER_INACTIVE; |
| 1381 | + } |
1361 | 1382 | return 0; |
1362 | 1383 | } |
1363 | 1384 |
|
/*
 * Mark every SWIM member of the primary group except the local one as
 * SWIM_MEMBER_INACTIVE, keeping csm_alive_or_suspect_count in step.
 *
 * Returns immediately when crt_gdata.cg_swim_inited is false. Otherwise it
 * sets sc_glitch on the SWIM context, then runs crt_swim_suspend_cb() over all
 * members while holding the csm lock. A non-zero iteration result trips
 * D_ASSERTF.
 *
 * NOTE(review): rows marked '-' are the superseded lines of this diff, rows
 * marked '+' are their replacements.
 */
1364 | 1385 | void crt_swim_suspend_all(void) |
1365 | 1386 | { |
1366 | | - struct crt_grp_priv *grp_priv = crt_gdata.cg_grp->gg_primary_grp; |
1367 | | - struct crt_swim_membs *csm = &grp_priv->gp_membs_swim; |
1368 | | - swim_id_t self_id; |
1369 | | - int rc; |
| 1387 | + struct crt_grp_priv *grp_priv = crt_gdata.cg_grp->gg_primary_grp; |
| 1388 | + struct crt_swim_membs *csm = &grp_priv->gp_membs_swim; |
| 1389 | + struct crt_swim_suspend_arg arg; |
| 1390 | + int rc; |
1370 | 1391 |

1371 | 1392 | if (!crt_gdata.cg_swim_inited) |
1372 | 1393 | return; |
1373 | 1394 |

/* NOTE(review): sc_glitch semantics are not visible here — confirm in the SWIM context definition. */
1374 | 1395 | csm->csm_ctx->sc_glitch = 1; |
1375 | | - self_id = swim_self_get(csm->csm_ctx); |
| 1396 | + |
/* Both fields are filled in before the lock is taken; the callback reads them on every visited member. */
| 1397 | + arg.csm = csm; |
| 1398 | + arg.self_id = swim_self_get(csm->csm_ctx); |
| 1399 | + |
1376 | 1400 | crt_swim_csm_lock(csm); |
1377 | | - rc = crt_swim_membs_iterate(csm, crt_swim_suspend_cb, &self_id); |
| 1401 | + rc = crt_swim_membs_iterate(csm, crt_swim_suspend_cb, &arg); |
1378 | 1402 | D_ASSERTF(rc == 0, "suspend SWIM members: "DF_RC"\n", DP_RC(rc)); |
1379 | 1403 | crt_swim_csm_unlock(csm); |
1380 | 1404 | } |
@@ -1625,6 +1649,8 @@ crt_swim_rank_check(struct crt_grp_priv *grp_priv, d_rank_t rank, uint64_t incar |
1625 | 1649 | if (cst->cst_state.sms_incarnation < incarnation) { |
1626 | 1650 | state_prev = cst->cst_state; |
1627 | 1651 | cst->cst_state.sms_incarnation = incarnation; |
| 1652 | + if (!crt_swim_status_alive_or_suspect(cst->cst_state.sms_status)) |
| 1653 | + csm->csm_alive_or_suspect_count++; |
1628 | 1654 | cst->cst_state.sms_status = SWIM_MEMBER_ALIVE; |
1629 | 1655 | state = cst->cst_state; |
1630 | 1656 | updated = true; |
|
0 commit comments