Skip to content

Commit 561ec6f

Browse files
authored
Merge pull request #1183 from grondo/issue#1182
resource: ensure all resources start in DOWN state when some ranks are excluded by configuration
2 parents 17f0ed1 + a5607c3 commit 561ec6f

File tree

2 files changed

+73
-8
lines changed

2 files changed

+73
-8
lines changed

resource/modules/resource_match.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,14 +1144,10 @@ static int grow_resource_db (std::shared_ptr<resource_ctx_t> &ctx,
11441144
static int decode_all (std::shared_ptr<resource_ctx_t> &ctx,
11451145
std::set<int64_t> &ranks)
11461146
{
1147-
int64_t size = ctx->db->metadata.by_rank.size();
1148-
1149-
for (int64_t rank = 0; rank < size; ++rank) {
1150-
auto ret = ranks.insert (rank);
1151-
if (!ret.second) {
1152-
errno = EEXIST;
1153-
return -1;
1154-
}
1147+
ranks.clear ();
1148+
for (auto const& kv: ctx->db->metadata.by_rank) {
1149+
if (kv.first >= 0)
1150+
ranks.insert (kv.first);
11551151
}
11561152
return 0;
11571153
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash -e
2+
#
3+
# Ensure Fluxion marks all ranks down even if some ranks are excluded
4+
#
5+
6+
log() { printf "issue#1182: $@\n" >&2; }
7+
8+
# Need a few ranks for this test, so start a new instance of size=4
9+
if test "$ISSUE_1182_ACTIVE" != "t"; then
10+
export ISSUE_1182_ACTIVE=t
11+
log "Re-launching test script under flux-start"
12+
exec flux start -s 4 $0
13+
fi
14+
15+
cat <<'EOF' >rcheck.py
16+
import sys
17+
import flux
18+
from flux.resource.list import ResourceListRPC
19+
20+
h = flux.Flux()
21+
22+
rpc1 = ResourceListRPC(h, "resource.sched-status", nodeid=0)
23+
rpc2 = ResourceListRPC(h, "sched.resource-status", nodeid=0)
24+
25+
rset = rpc1.get()
26+
fluxion = rpc2.get()
27+
28+
def symmetric_diff(a, b):
29+
return (a|b) - (a&b)
30+
31+
diff = symmetric_diff(rset.down, fluxion.down)
32+
if diff.ranks:
33+
print("difference detected between fluxion and core down ranks:")
34+
print(f"hosts: {diff.nodelist}")
35+
print(f"ranks: {diff.ranks}")
36+
sys.exit(1)
37+
sys.exit(0)
38+
EOF
39+
40+
log "Unloading modules..."
41+
flux module remove sched-simple
42+
flux module remove resource
43+
44+
# Exclude ranks 0 and 2
45+
flux config load <<EOF
46+
[resource]
47+
exclude = "0,2"
48+
EOF
49+
50+
flux module load resource monitor-force-up
51+
52+
# Drain rank 3. Scheduler should only see rank 1 as up
53+
log "draining rank 3"
54+
flux resource drain 3
55+
56+
flux resource status
57+
58+
flux module load sched-fluxion-resource
59+
flux module load sched-fluxion-qmanager
60+
61+
log "comparing fluxion down ranks with flux-core resource module:"
62+
flux resource list
63+
FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
64+
flux python ./rcheck.py
65+
66+
log "reloading sched-simple..."
67+
flux module remove sched-fluxion-qmanager
68+
flux module remove sched-fluxion-resource
69+
flux module load sched-simple

0 commit comments

Comments
 (0)