Skip to content

Commit b9c5c97

Browse files
authored
Merge pull request #6201 from garlick/issue#6199
include offline nodes in flux overlay errors output
2 parents 85eaaa9 + e040a7e commit b9c5c97

File tree

3 files changed

+11 -8 lines changed

doc/man1/flux-overlay.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,9 @@ errors
 
 .. program:: flux overlay errors
 
-:program:`flux overlay errors` summarizes any errors recorded for lost nodes.
-The output consists of one line per unique error with a hostlist prefix.
+:program:`flux overlay errors` summarizes any errors recorded for lost or
+offline nodes. The output consists of one line per unique error with a
+hostlist prefix.
 
 .. option:: -t, --timeout=FSD
 

src/cmd/builtin/overlay.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -913,8 +913,11 @@ void gather_errors (flux_t *h,
                     log_msg_exit ("error adding to error hash");
             }
             else if (streq (status, "offline")) {
-                /* Don't report offline nodes.
-                 */
+                // report offline only if there is error text
+                if (error && strlen (error) > 0) {
+                    if (errhash_add_one (errhash, child_rank, error) < 0)
+                        log_msg_exit ("error adding to error hash");
+                }
             }
             else { // recurse
                 gather_errors (h, child_rank, errhash, timeout);

t/t3303-system-healthcheck.t

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,9 @@ test_expect_success 'flux overlay status shows rank 3 offline' '
 	test_cmp health.exp health.out
 '
 
-test_expect_success 'flux overlay errors prints nothing' '
+test_expect_success 'flux overlay errors shows rank 3' '
 	flux overlay errors --timeout=0 >errors2.out &&
-	test $(wc -l <errors2.out) -eq 0
+	grep "fake3: administrative shutdown" errors2.out
 '
 
 test_expect_success 'flux overlay status --summary' '
@@ -202,9 +202,8 @@ test_expect_success 'ping to rank 14 fails with EHOSTUNREACH' '
 '
 
 test_expect_success 'flux overlay errors shows the lost connection' '
-	echo "fake14: lost connection" >errors3.exp &&
 	flux overlay errors --timeout=0 >errors3.out &&
-	test_cmp errors3.exp errors3.out
+	grep "fake14: lost connection" errors3.out
 '
 
 test_expect_success 'wait for rank 0 subtree to be degraded' '

0 commit comments

Comments
 (0)