Skip to content

Commit e47d35d

Browse files
committed
testsuite: cover flub bootstrap
Problem: there is no test coverage for adding brokers to a flux instance. Add some tests.
1 parent a70f717 commit e47d35d

File tree

2 files changed

+194
-0
lines changed

2 files changed

+194
-0
lines changed

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ TESTSCRIPTS = \
9090
t0023-jobspec1-validate.t \
9191
t0026-flux-R.t \
9292
t0033-size-override.t \
93+
t0034-flub.t \
9394
t1000-kvs.t \
9495
t1001-kvs-internals.t \
9596
t1003-kvs-stress.t \

t/t0034-flub.t

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
#!/bin/sh
2+
#
# Sharness tests for the flub bootstrap method: brokers started with
# -Sbroker.boot-server=URI are expected to join an already-running instance.
3+
4+
test_description='Test flub bootstrap method'
5+
6+
# Load the sharness harness that sits in the same directory as this script.
. `dirname $0`/sharness.sh
7+
8+
# Run everything below inside a test instance.  test_under_flux is not
# defined in this file; "8" and "full" are passed through to it as-is
# (presumably instance size and personality - confirm in the harness).
test_under_flux 8 full
9+
10+
# NOTE(review): presumably makes job URIs resolve to a local form so that
# "flux proxy" does not need ssh - confirm against the flux-uri docs.
export FLUX_URI_RESOLVE_LOCAL=t
11+
12+
# usage: get_job_uri id
#
# Print the URI of job "id" on stdout.
# The function first waits for the job's "memo" event (-t10; presumably a
# 10 second timeout), discarding that command's output, and runs "flux uri"
# only if the wait succeeded.  If the wait fails, nothing is printed and the
# function returns the wait's non-zero status.
# NOTE(review): presumably the memo event is what publishes the URI of a
# batch job's instance - confirm.
13+
get_job_uri() {
14+
flux job wait-event -t10 $1 memo >/dev/null && flux uri $1
15+
}
16+
17+
# usage: wait_for_service uri name
#
# Block until service "name" answers a ping inside the instance at "uri".
# A polling loop is run through "flux proxy": ping once (output discarded),
# and on failure sleep 0.5 and try again.  The loop has no retry limit of its
# own, so this function only returns once a ping succeeds or the proxied
# command is terminated from outside.
#
# Quoting: $1 and $2 are expanded by the calling shell.  The \" on each side
# adds literal double-quote characters around the loop text.
# NOTE(review): presumably those literal quotes keep the loop a single
# argument to "bash -c" when flux proxy re-parses its arguments through a
# shell - confirm before simplifying the quoting.
18+
wait_for_service() {
19+
flux proxy $1 bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\"
20+
}
21+
22+
# Negative case: a broker pointed at a boot server URI that does not exist
# must exit non-zero, and its stderr (captured in server.err) must contain
# "was not found".  rc1_path and rc3_path are set to empty values
# (presumably to skip the rc scripts - confirm).
test_expect_success 'broker fails with bad broker.boot-server' '
23+
test_must_fail flux broker \
24+
-Sbroker.rc1_path= -Sbroker.rc3_path= \
25+
-Sbroker.boot-server=local://noexist/path \
26+
/bin/true 2>server.err &&
27+
grep "was not found" server.err
28+
'
29+
30+
# Baseline scenario: a 1 node batch job started without any size override.
# Its URI is saved in test1.uri for the following tests.
test_expect_success 'start a 1 node job with 0 extra ranks' '
31+
id=$(flux batch -N1 --wrap sleep inf) &&
32+
get_job_uri $id >test1.uri
33+
'
34+
# The "size" attribute is read through flux proxy, i.e. from inside the
# batch job's instance rather than from the enclosing test instance.
test_expect_success 'job has size 1' '
35+
size=$(flux proxy $(cat test1.uri) flux getattr size) &&
36+
test $size -eq 1
37+
'
38+
# A broker that tries to join this instance must fail and report
# "no available ranks" on stderr (captured in noranks.err).
test_expect_success 'flub bootstrap fails with no available ranks' '
39+
test_must_fail flux broker \
40+
-Sbroker.boot-server=$(cat test1.uri) 2>noranks.err &&
41+
grep "no available ranks" noranks.err
42+
'
43+
# Cancel every job submitted so far, including the batch job above.
test_expect_success 'clean up' '
44+
flux cancel --all
45+
'
46+
47+
48+
#
49+
# Start 2 node batch job with one extra slot.
50+
# Submit 1 node broker job that fills the slot.
51+
# Run a parallel job across all three nodes in the batch job.
52+
# This test is constrained so that all flubbed nodes are leaf nodes,
53+
# and the flubbed nodes connect to rank 0 only.
#
54+
55+
# fake3.toml holds one [[resource.config]] entry naming hosts a,b,c with
# cores 0-3, and sets noverify = true under [resource].
# NOTE(review): presumably noverify stops the fake hosts from being checked
# against the real hardware - confirm.
test_expect_success 'create config with 3 fake nodes' '
56+
cat >fake3.toml <<-EOT
57+
[resource]
58+
noverify = true
59+
[[resource.config]]
60+
hosts = "a,b,c"
61+
cores = "0-3"
62+
EOT
63+
'
64+
# The batch job gets 2 nodes (-N2) but its brokers are started with size=3,
# so one rank is left unfilled.  The URI is saved in test2.uri.
# NOTE(review): broker.quorum=2 presumably lets the instance finish starting
# with only the 2 brokers that are present - confirm.
test_expect_success 'start a 2 node job with 1 extra rank' '
65+
id=$(flux batch -N2 \
66+
--broker-opts=--config-path=fake3.toml \
67+
--broker-opts=-Ssize=3 \
68+
--broker-opts=-Sbroker.quorum=2 \
69+
--broker-opts=-Stbon.topo=kary:0 \
70+
--wrap sleep inf) &&
71+
get_job_uri $id >test2.uri
72+
'
73+
# The instance reports the overridden size (3), not the 2 allocated nodes.
test_expect_success 'job has size 3' '
74+
size=$(flux proxy $(cat test2.uri) flux getattr size) &&
75+
test $size -eq 3
76+
'
77+
# Before any broker has joined, the overlay status output (saved in
# ov2.out) must contain "2 extra0: offline".
test_expect_success 'overlay status shows extra node offline' '
78+
flux proxy $(cat test2.uri) \
79+
flux overlay status --no-pretty >ov2.out &&
80+
grep "2 extra0: offline" ov2.out
81+
'
82+
# wait_for_service returns once job-ingest answers a ping; only then is the
# 2 node job run inside the batch instance, bounded by run_timeout 30.
test_expect_success 'run a 2 node job in the initial instance' '
83+
wait_for_service $(cat test2.uri) job-ingest &&
84+
run_timeout 30 flux proxy $(cat test2.uri) \
85+
flux run --label-io -N2 flux pmi barrier
86+
'
87+
# The extra broker is submitted as a 1 node job in the enclosing test
# instance (no flux proxy here) and pointed at the batch job's URI through
# broker.boot-server.  This test only waits for shell.start in the guest
# exec eventlog; it does not by itself show that the broker has joined.
test_expect_success 'submit a job that starts 1 extra broker' '
88+
id=$(flux submit -N1 flux broker \
89+
--config-path=fake3.toml \
90+
-Stbon.topo=kary:0 \
91+
-Sbroker.boot-server=$(cat test2.uri)) &&
92+
flux job wait-event -p guest.exec.eventlog $id shell.start
93+
'
94+
# Joining is confirmed here: wait (--timeout 10s) for the overlay status of
# the batch instance to become "full".
test_expect_success 'wait for overlay status to be full' '
95+
flux proxy $(cat test2.uri) \
96+
flux overlay status --summary --wait full --timeout 10s
97+
'
98+
# With the slot filled, a job spanning all 3 nodes must run.
test_expect_success 'run a 3 node job in the expanded instance' '
99+
run_timeout 30 flux proxy $(cat test2.uri) \
100+
flux run --label-io -N3 flux pmi barrier
101+
'
102+
# Cancel every job in the test instance (batch job and broker job alike).
test_expect_success 'clean up' '
103+
flux cancel --all
104+
'
105+
106+
# fake7.toml has the same layout as fake3.toml but names seven hosts (a-g),
# each with cores 0-3; it is used by the remaining tests in this file.
test_expect_success 'create config with 7 fake nodes' '
107+
cat >fake7.toml <<-EOT
108+
[resource]
109+
noverify = true
110+
[[resource.config]]
111+
hosts = "a,b,c,d,e,f,g"
112+
cores = "0-3"
113+
EOT
114+
'
115+
116+
#
117+
# Start 1 node batch job with 6 extra slots (kary:2).
118+
# Submit 6 node broker job that fills all the slots.
119+
# Run a 7 node parallel job.
120+
#
121+
# One node is allocated (-N1) while the broker is started with size=7 and
# broker.quorum=1.  The URI is saved in test5.uri.
# NOTE(review): the file is named test5.uri although the earlier scenarios
# used test1.uri and test2.uri; the name is only a label.
test_expect_success 'start a 1 node job with 6 extra ranks' '
122+
id=$(flux batch -N1 \
123+
--broker-opts=--config-path=fake7.toml \
124+
--broker-opts=-Ssize=7 \
125+
--broker-opts=-Sbroker.quorum=1 \
126+
--broker-opts=-Stbon.topo=kary:2 \
127+
--wrap sleep inf) &&
128+
get_job_uri $id >test5.uri
129+
'
130+
# Wait for job-ingest to answer a ping, then run a 1 node job while only
# the original broker is present.
test_expect_success 'run a 1 node job in the initial instance' '
131+
wait_for_service $(cat test5.uri) job-ingest &&
132+
run_timeout 30 flux proxy $(cat test5.uri) \
133+
flux run --label-io -N1 flux pmi barrier
134+
'
135+
# The instance reports the overridden size (7), not the 1 allocated node.
test_expect_success 'job has size 7' '
136+
size=$(flux proxy $(cat test5.uri) flux getattr size) &&
137+
test $size -eq 7
138+
'
139+
# N.B. include exit-timeout=none so we can safely disconnect one node later
#
# Six brokers are submitted as one 6 node job in the enclosing test instance,
# all pointed at the batch job's URI.  The job id is saved in xtra_id.
# NOTE(review): xtra_id is written here but not read anywhere in the lines
# visible in this file.
140+
test_expect_success 'submit a job that starts 6 extra brokers' '
141+
id=$(flux submit -N6 -o exit-timeout=none \
142+
flux broker \
143+
--config-path=fake7.toml \
144+
-Stbon.topo=kary:2 \
145+
-Sbroker.boot-server=$(cat test5.uri)) &&
146+
flux job wait-event -p guest.exec.eventlog $id shell.start &&
147+
echo $id >xtra_id
148+
'
149+
# Wait (--timeout 10s) until the batch instance reports overlay status
# "full", i.e. the submitted brokers have joined.
test_expect_success 'wait for overlay status to be full' '
150+
flux proxy $(cat test5.uri) \
151+
flux overlay status --summary --wait full --timeout 10s
152+
'
153+
# A job spanning all 7 nodes must now run inside the batch instance.
test_expect_success 'run a 7 node job in the expanded instance' '
154+
run_timeout 30 flux proxy $(cat test5.uri) \
155+
flux run --label-io -N7 flux pmi barrier
156+
'
157+
158+
#
159+
# Show that a node can be replaced
#
160+
161+
# Step 1: ask the batch instance to disconnect rank 6.
test_expect_success 'disconnect rank 6' '
162+
flux proxy $(cat test5.uri) \
163+
flux overlay disconnect 6
164+
'
165+
# Step 2: a single ping to rank 6 must now fail.  Per the test title the
# failing ping is also meant to trigger EHOSTUNREACH handling.
test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' '
166+
test_must_fail flux proxy $(cat test5.uri) \
167+
flux ping -c1 6
168+
'
169+
# Step 3: wait (--timeout 10s) for the overlay status to become "degraded".
test_expect_success 'wait for overlay status to be degraded' '
170+
flux proxy $(cat test5.uri) \
171+
flux overlay status --summary --wait degraded --timeout 10s
172+
'
173+
# A single replacement broker is submitted with the same config and topology
# as the original six and pointed at the same instance URI.
# NOTE(review): presumably it takes over the slot vacated by rank 6 -
# nothing in this test checks which rank it receives.
test_expect_success 'submit a job that starts 1 broker' '
174+
id=$(flux submit -N1 flux broker \
175+
--config-path=fake7.toml \
176+
-Stbon.topo=kary:2 \
177+
-Sbroker.boot-server=$(cat test5.uri)) &&
178+
flux job wait-event -p guest.exec.eventlog $id shell.start
179+
'
180+
# The overlay must return to "full" within the 10s timeout.
test_expect_success 'wait for overlay status to be full' '
181+
flux proxy $(cat test5.uri) \
182+
flux overlay status --summary --wait full --timeout 10s
183+
'
184+
# A 7 node job must run again now that the instance is back to full.
test_expect_success 'run a 7 node job in the expanded instance' '
185+
run_timeout 30 flux proxy $(cat test5.uri) \
186+
flux run --label-io -N7 flux pmi barrier
187+
'
188+
189+
# Cancel every job still present in the test instance.
test_expect_success 'clean up' '
190+
flux cancel --all
191+
'
192+
193+
# Required last call of a sharness script: reports the results.
test_done

0 commit comments

Comments
 (0)