Skip to content

Commit e9b6f9b

Browse files
authored
DAOS-17405 cart: Add env var SWIM_SUBGROUP_SIZE (#16228)
Make the number of iping targets, called the subgroup size, tunable by new environment variable SWIM_SUBGROUP_SIZE. Log new SWIM suspicions at INFO level, including their origins. This may help debugging why a DEAD event happens. Signed-off-by: Li Wei <liwei@hpe.com>
1 parent bf5ae50 commit e9b6f9b

File tree

3 files changed

+54
-40
lines changed

3 files changed

+54
-40
lines changed

src/cart/crt_internal_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ struct crt_event_cb_priv {
241241
ENV(FI_UNIVERSE_SIZE) \
242242
ENV(SWIM_PING_TIMEOUT) \
243243
ENV(SWIM_PROTOCOL_PERIOD_LEN) \
244+
ENV(SWIM_SUBGROUP_SIZE) \
244245
ENV(SWIM_SUSPECT_TIMEOUT) \
245246
ENV_STR(SWIM_TRAFFIC_CLASS) \
246247
ENV_STR(UCX_IB_FORK_INIT)

src/cart/swim/swim.c

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (c) 2016 UChicago Argonne, LLC
33
* (C) Copyright 2018-2024 Intel Corporation.
44
* (C) Copyright 2025 Google LLC
5+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
56
*
67
* SPDX-License-Identifier: BSD-2-Clause-Patent
78
*/
@@ -20,6 +21,7 @@ static const char *SWIM_STATUS_STR[] = {
2021
static uint64_t swim_prot_period_len;
2122
static uint64_t swim_suspect_timeout;
2223
static uint64_t swim_ping_timeout;
24+
static int swim_subgroup_size;
2325

2426
static inline uint64_t
2527
swim_prot_period_len_default(void)
@@ -48,6 +50,15 @@ swim_ping_timeout_default(void)
4850
return val;
4951
}
5052

53+
static inline int
54+
swim_subgroup_size_default(void)
55+
{
56+
unsigned int val = SWIM_SUBGROUP_SIZE;
57+
58+
d_getenv_uint("SWIM_SUBGROUP_SIZE", &val);
59+
return val;
60+
}
61+
5162
void
5263
swim_period_set(uint64_t val)
5364
{
@@ -112,9 +123,8 @@ swim_dump_updates(swim_id_t self_id, swim_id_t from_id, swim_id_t to_id,
112123
fclose(fp);
113124
/* msg and msg_size will be set after fclose(fp) only */
114125
if (msg_size > 0)
115-
SWIM_INFO("%lu %s %lu:%s\n", self_id,
116-
self_id == from_id ? "=>" : "<=",
117-
self_id == from_id ? to_id : from_id, msg);
126+
SWIM_DEBUG("%lu %s %lu:%s\n", self_id, self_id == from_id ? "=>" : "<=",
127+
self_id == from_id ? to_id : from_id, msg);
118128
free(msg); /* allocated by open_memstream() */
119129
}
120130
}
@@ -149,7 +159,7 @@ swim_updates_prepare(struct swim_context *ctx, swim_id_t id, swim_id_t to,
149159
rc = ctx->sc_ops->get_member_state(ctx, id, &upds[n].smu_state);
150160
if (rc) {
151161
if (rc == -DER_NONEXIST)
152-
SWIM_INFO("%lu: not bootstrapped yet with %lu\n", self_id, id);
162+
SWIM_DEBUG("%lu: not bootstrapped yet with %lu\n", self_id, id);
153163
else
154164
SWIM_ERROR("get_member_state(%lu): "DF_RC"\n", id, DP_RC(rc));
155165
D_GOTO(out_unlock, rc);
@@ -171,7 +181,7 @@ swim_updates_prepare(struct swim_context *ctx, swim_id_t id, swim_id_t to,
171181
rc = ctx->sc_ops->get_member_state(ctx, to, &upds[n].smu_state);
172182
if (rc) {
173183
if (rc == -DER_NONEXIST)
174-
SWIM_INFO("%lu: not bootstrapped yet with %lu\n", self_id, to);
184+
SWIM_DEBUG("%lu: not bootstrapped yet with %lu\n", self_id, to);
175185
else
176186
SWIM_ERROR("get_member_state(%lu): "DF_RC"\n", to, DP_RC(rc));
177187
D_GOTO(out_unlock, rc);
@@ -295,7 +305,7 @@ swim_member_alive(struct swim_context *ctx, swim_id_t from, swim_id_t id, uint64
295305
swim_id_t self_id = swim_self_get(ctx);
296306

297307
if (rc == -DER_NONEXIST)
298-
SWIM_INFO("%lu: not bootstrapped yet with %lu\n", self_id, id);
308+
SWIM_DEBUG("%lu: not bootstrapped yet with %lu\n", self_id, id);
299309
else
300310
SWIM_ERROR("get_member_state(%lu): "DF_RC"\n", id, DP_RC(rc));
301311
D_GOTO(out, rc);
@@ -327,7 +337,7 @@ swim_member_alive(struct swim_context *ctx, swim_id_t from, swim_id_t id, uint64
327337
}
328338
}
329339

330-
SWIM_INFO("member %lu %lu is ALIVE\n", id, nr);
340+
SWIM_INFO("%lu: member %lu %lu is ALIVE from %lu\n", ctx->sc_self, id, nr, from);
331341
id_state.sms_incarnation = nr;
332342
id_state.sms_status = SWIM_MEMBER_ALIVE;
333343
rc = swim_updates_notify(ctx, from, id, &id_state, count);
@@ -374,7 +384,8 @@ swim_member_dead(struct swim_context *ctx, swim_id_t from, swim_id_t id, uint64_
374384
}
375385
}
376386

377-
SWIM_ERROR("member %lu %lu is DEAD\n", id, nr);
387+
SWIM_ERROR("%lu: member %lu %lu is DEAD from %lu%s\n", ctx->sc_self, id, nr, from,
388+
from == ctx->sc_self ? " (self)" : "");
378389
id_state.sms_incarnation = nr;
379390
id_state.sms_status = SWIM_MEMBER_DEAD;
380391
rc = swim_updates_notify(ctx, from, id, &id_state, 0);
@@ -437,7 +448,8 @@ swim_member_suspect(struct swim_context *ctx, swim_id_t from, swim_id_t id, uint
437448
TAILQ_INSERT_TAIL(&ctx->sc_suspects, item, si_link);
438449

439450
update:
440-
SWIM_INFO("member %lu %lu is SUSPECT\n", id, nr);
451+
SWIM_INFO("%lu: member %lu %lu is SUSPECT from %lu%s\n", ctx->sc_self, id, nr, from,
452+
from == ctx->sc_self ? " (self)" : "");
441453
id_state.sms_incarnation = nr;
442454
id_state.sms_status = SWIM_MEMBER_SUSPECT;
443455
rc = swim_updates_notify(ctx, from, id, &id_state, 0);
@@ -472,8 +484,7 @@ swim_member_update_suspected(struct swim_context *ctx, uint64_t now, uint64_t ne
472484
D_GOTO(next_item, rc = 0);
473485
}
474486

475-
SWIM_INFO("%lu: suspect timeout %lu\n",
476-
self_id, item->si_id);
487+
SWIM_DEBUG("%lu: suspect timeout %lu\n", self_id, item->si_id);
477488
if (item->si_from != self_id) {
478489
/* let's try to confirm from gossip origin */
479490
id = item->si_id;
@@ -512,8 +523,8 @@ swim_member_update_suspected(struct swim_context *ctx, uint64_t now, uint64_t ne
512523
while (item != NULL) {
513524
next = TAILQ_NEXT(item, si_link);
514525

515-
SWIM_INFO("try to confirm from source. %lu: %lu <= %lu\n",
516-
self_id, item->si_id, item->si_from);
526+
SWIM_DEBUG("try to confirm from source. %lu: %lu <= %lu\n", self_id, item->si_id,
527+
item->si_from);
517528

518529
rc = swim_updates_send(ctx, item->si_id, item->si_from);
519530
if (rc)
@@ -556,8 +567,8 @@ swim_ipings_update(struct swim_context *ctx, uint64_t now, uint64_t net_glitch_d
556567
item = TAILQ_FIRST(&targets);
557568
while (item != NULL) {
558569
next = TAILQ_NEXT(item, si_link);
559-
SWIM_INFO("reply IREQ expired. %lu: %lu => %lu\n",
560-
self_id, item->si_from, item->si_id);
570+
SWIM_DEBUG("reply IREQ expired. %lu: %lu => %lu\n", self_id, item->si_from,
571+
item->si_id);
561572

562573
rc = ctx->sc_ops->send_reply(ctx, item->si_id, item->si_from,
563574
-DER_TIMEDOUT, item->si_args);
@@ -596,8 +607,7 @@ swim_ipings_reply(struct swim_context *ctx, swim_id_t to_id, int ret_rc)
596607
item = TAILQ_FIRST(&targets);
597608
while (item != NULL) {
598609
next = TAILQ_NEXT(item, si_link);
599-
SWIM_INFO("reply IREQ. %lu: %lu <= %lu\n",
600-
self_id, item->si_id, item->si_from);
610+
SWIM_DEBUG("reply IREQ. %lu: %lu <= %lu\n", self_id, item->si_id, item->si_from);
601611

602612
rc = ctx->sc_ops->send_reply(ctx, item->si_id, item->si_from,
603613
ret_rc, item->si_args);
@@ -649,7 +659,7 @@ swim_subgroup_init(struct swim_context *ctx)
649659
swim_id_t id;
650660
int i, rc = 0;
651661

652-
for (i = 0; i < SWIM_SUBGROUP_SIZE; i++) {
662+
for (i = 0; i < swim_subgroup_size; i++) {
653663
id = ctx->sc_ops->get_iping_target(ctx);
654664
if (id == SWIM_ID_INVALID)
655665
D_GOTO(out, rc = 0);
@@ -736,6 +746,7 @@ swim_init(swim_id_t self_id, struct swim_ops *swim_ops, void *data)
736746
swim_prot_period_len = swim_prot_period_len_default();
737747
swim_suspect_timeout = swim_suspect_timeout_default();
738748
swim_ping_timeout = swim_ping_timeout_default();
749+
swim_subgroup_size = swim_subgroup_size_default();
739750

740751
ctx->sc_default_ping_timeout = swim_ping_timeout;
741752

@@ -911,12 +922,12 @@ swim_progress(struct swim_context *ctx, int64_t timeout_us)
911922
target_id = ctx->sc_target;
912923
sendto_id = ctx->sc_target;
913924
send_updates = true;
914-
SWIM_INFO("%lu: dping %lu => {%lu %c %lu} "
915-
"delay: %u ms, timeout: %lu ms\n",
916-
ctx->sc_self, ctx->sc_self, sendto_id,
917-
SWIM_STATUS_CHARS[target_state.sms_status],
918-
target_state.sms_incarnation,
919-
target_state.sms_delay, delay);
925+
SWIM_DEBUG("%lu: dping %lu => {%lu %c %lu} "
926+
"delay: %u ms, timeout: %lu ms\n",
927+
ctx->sc_self, ctx->sc_self, sendto_id,
928+
SWIM_STATUS_CHARS[target_state.sms_status],
929+
target_state.sms_incarnation, target_state.sms_delay,
930+
delay);
920931

921932
ctx->sc_next_tick_time = now + swim_period_get();
922933
ctx->sc_deadline = now + delay;
@@ -992,23 +1003,22 @@ swim_progress(struct swim_context *ctx, int64_t timeout_us)
9921003
goto done_item;
9931004

9941005
delay *= 2;
995-
SWIM_INFO("%lu: ireq %lu => {%lu %c %lu} "
996-
"delay: %u ms, timeout: %lu ms\n",
997-
ctx->sc_self, sendto_id, target_id,
998-
SWIM_STATUS_CHARS[target_state.sms_status],
999-
target_state.sms_incarnation,
1000-
target_state.sms_delay, delay);
1006+
SWIM_DEBUG("%lu: ireq %lu => {%lu %c %lu} "
1007+
"delay: %u ms, timeout: %lu ms\n",
1008+
ctx->sc_self, sendto_id, target_id,
1009+
SWIM_STATUS_CHARS[target_state.sms_status],
1010+
target_state.sms_incarnation,
1011+
target_state.sms_delay, delay);
10011012
} else {
10021013
/* Send ping only if this member is not respond yet */
10031014
if (state.sms_status != SWIM_MEMBER_INACTIVE)
10041015
goto done_item;
10051016

1006-
SWIM_INFO("%lu: dping %lu => {%lu %c %lu} "
1007-
"delay: %u ms, timeout: %lu ms\n",
1008-
ctx->sc_self, ctx->sc_self, sendto_id,
1009-
SWIM_STATUS_CHARS[state.sms_status],
1010-
state.sms_incarnation,
1011-
state.sms_delay, delay);
1017+
SWIM_DEBUG("%lu: dping %lu => {%lu %c %lu} "
1018+
"delay: %u ms, timeout: %lu ms\n",
1019+
ctx->sc_self, ctx->sc_self, sendto_id,
1020+
SWIM_STATUS_CHARS[state.sms_status],
1021+
state.sms_incarnation, state.sms_delay, delay);
10121022
}
10131023

10141024
send_updates = true;
@@ -1097,7 +1107,8 @@ swim_updates_parse(struct swim_context *ctx, swim_id_t from_id, swim_id_t id,
10971107
rc = ctx->sc_ops->get_member_state(ctx, from_id, &id_state);
10981108
if (rc == -DER_NONEXIST || id_state.sms_status == SWIM_MEMBER_DEAD) {
10991109
swim_ctx_unlock(ctx);
1100-
SWIM_INFO("%lu: skip untrustable update from %lu, rc = %d\n", self_id, from_id, rc);
1110+
SWIM_DEBUG("%lu: skip untrustable update from %lu, rc = %d\n", self_id, from_id,
1111+
rc);
11011112
D_GOTO(out, rc = -DER_NONEXIST);
11021113
} else if (rc != 0) {
11031114
swim_ctx_unlock(ctx);
@@ -1108,8 +1119,8 @@ swim_updates_parse(struct swim_context *ctx, swim_id_t from_id, swim_id_t id,
11081119
if ((from_id == ctx->sc_target || id == ctx->sc_target) &&
11091120
(ctx_state == SCS_BEGIN || ctx_state == SCS_PINGED || ctx_state == SCS_IPINGED)) {
11101121
ctx_state = SCS_SELECT;
1111-
SWIM_INFO("target %lu %s okay\n", ctx->sc_target,
1112-
from_id == id ? "dping" : "iping");
1122+
SWIM_DEBUG("target %lu %s okay\n", ctx->sc_target,
1123+
from_id == id ? "dping" : "iping");
11131124
}
11141125

11151126
for (i = 0; i < nupds; i++) {

src/cart/swim/swim_internal.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2016 UChicago Argonne, LLC
33
* (C) Copyright 2018-2023 Intel Corporation.
4+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
45
*
56
* SPDX-License-Identifier: BSD-2-Clause-Patent
67
*/
@@ -26,7 +27,8 @@
2627
#include <gurt/common.h>
2728

2829
/* Use debug capability from CaRT */
29-
#define SWIM_INFO(fmt, ...) D_DEBUG(DLOG_DBG, fmt, ##__VA_ARGS__)
30+
#define SWIM_DEBUG(fmt, ...) D_DEBUG(DLOG_DBG, fmt, ##__VA_ARGS__)
31+
#define SWIM_INFO(fmt, ...) D_DEBUG(DLOG_INFO, fmt, ##__VA_ARGS__)
3032
#define SWIM_ERROR(fmt, ...) D_DEBUG(DLOG_ERR, fmt, ##__VA_ARGS__)
3133

3234
#ifdef _USE_ABT_SYNC_

0 commit comments

Comments
 (0)