Skip to content

Commit 8fb59d9

Browse files
committed
testsuite: cover flub bootstrap
Problem: there is no test coverage for adding brokers to a flux instance. Add some tests.
1 parent ef31923 commit 8fb59d9

File tree

2 files changed

+265
-0
lines changed

2 files changed

+265
-0
lines changed

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ TESTSCRIPTS = \
9090
t0023-jobspec1-validate.t \
9191
t0026-flux-R.t \
9292
t0033-size-override.t \
93+
t0034-flub.t \
9394
t1000-kvs.t \
9495
t1001-kvs-internals.t \
9596
t1003-kvs-stress.t \

t/t0034-flub.t

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#!/bin/sh
2+
#
3+
4+
# Human-readable summary of this test script.
test_description='Test flub bootstrap method'
5+
6+
# Load the sharness library that lives in the same directory as this script.
. `dirname $0`/sharness.sh
7+
8+
# NOTE(review): presumably runs the rest of this script inside a test Flux
# instance of size 8 using the "full" personality -- confirm against the
# test_under_flux helper.
test_under_flux 8 full
9+
10+
# Export FLUX_SSH so child processes use the test suite's tssh script.
# NOTE(review): presumably a local stand-in for ssh when resolving job
# URIs -- confirm.
export FLUX_SSH="${SHARNESS_TEST_SRCDIR}/scripts/tssh"
11+
12+
# usage: get_job_uri id
13+
get_job_uri() {
	# Print the URI of the job whose id is $1.
	#
	# First waits for the job's "memo" event (the wait is limited by
	# -t10 and the event text is discarded), then prints the result of
	# "flux uri" on stdout.  Returns non-zero if either step fails; the
	# URI lookup is skipped when the wait fails.
	#
	# The id is quoted so it always reaches flux as exactly one argument.
	flux job wait-event -t10 "$1" memo >/dev/null && flux uri "$1"
}
16+
17+
# usage: wait_for_service uri name
18+
wait_for_service() {
	# Block until service $2 answers "flux ping" in the instance at URI $1.
	#
	# Runs a bash loop through "flux proxy" that pings the service once
	# per iteration and sleeps 0.5s between failed attempts.
	#
	# The loop text is passed as ONE argument that itself begins and ends
	# with a literal double quote (the \" on each side).
	# NOTE(review): presumably required because flux proxy re-parses its
	# command line through a shell -- confirm against flux-proxy(1) before
	# changing the quoting.
	#
	# NOTE: the loop has no upper bound; if the service never responds
	# this call does not return on its own.
	#
	# The URI is quoted so it always reaches flux proxy as one argument.
	flux proxy "$1" bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\"
}
21+
22+
# A broker given a broker.boot-server URI that does not exist must fail,
# and its stderr must contain "was not found".
test_expect_success 'broker fails with bad broker.boot-server' '
23+
test_must_fail flux broker \
24+
-Sbroker.rc1_path= -Sbroker.rc3_path= \
25+
-Sbroker.boot-server=local://noexist/path \
26+
/bin/true 2>server.err &&
27+
grep "was not found" server.err
28+
'
29+
30+
# Start a one-node batch job with no size override and save its URI
# in test1.uri for the following tests.
test_expect_success 'start a 1 node job with 0 extra ranks' '
31+
id=$(flux batch -N1 --wrap sleep inf) &&
32+
get_job_uri $id >test1.uri
33+
'
34+
# The batch instance must report a size attribute of exactly 1.
test_expect_success 'job has size 1' '
35+
size=$(flux proxy $(cat test1.uri) flux getattr size) &&
36+
test $size -eq 1
37+
'
38+
# A broker that uses the size-1 instance as its boot server must fail,
# and its stderr must contain "no available ranks".
test_expect_success 'flub bootstrap fails with no available ranks' '
39+
test_must_fail flux broker \
40+
-Sbroker.boot-server=$(cat test1.uri) 2>noranks.err &&
41+
grep "no available ranks" noranks.err
42+
'
43+
# Cancel every job submitted so far.
test_expect_success 'clean up' '
44+
flux cancel --all
45+
'
46+
47+
48+
#
49+
# Start 2 node batch job with one extra slot.
50+
# Submit 1 node broker job that fills the slot.
51+
# Run a parallel job across all three nodes in the batch job.
52+
# This test is constrained so that all flubbed nodes are leaf nodes,
53+
# and the flubbed nodes connect to rank 0 only.
54+
55+
# Write fake2.toml: a resource config naming three hosts (a,b,c) with
# cores 0-3 each and noverify = true.
test_expect_success 'create config with fake resources' '
56+
cat >fake2.toml <<-EOT
57+
[resource]
58+
noverify = true
59+
[[resource.config]]
60+
hosts = "a,b,c"
61+
cores = "0-3"
62+
EOT
63+
'
64+
# Start a 2-node batch job whose brokers are given size=3, quorum=2 and
# tbon.topo=kary:0, then save the job URI in test2.uri.
test_expect_success 'start a 2 node job with 1 extra rank' '
65+
id=$(flux batch -N2 \
66+
--broker-opts=--config-path=fake2.toml \
67+
--broker-opts=-Ssize=3 \
68+
--broker-opts=-Sbroker.quorum=2 \
69+
--broker-opts=-Stbon.topo=kary:0 \
70+
--wrap sleep inf) &&
71+
get_job_uri $id >test2.uri
72+
'
73+
# The instance must report size 3 even though only 2 nodes were requested.
test_expect_success 'job has size 3' '
74+
size=$(flux proxy $(cat test2.uri) flux getattr size) &&
75+
test $size -eq 3
76+
'
77+
# Before any broker is added, overlay status output must contain
# "2 extra0: offline".
test_expect_success 'overlay status shows extra node offline' '
78+
flux proxy $(cat test2.uri) \
79+
flux overlay status --no-pretty >ov2.out &&
80+
grep "2 extra0: offline" ov2.out
81+
'
82+
# Wait until job-ingest answers pings, then run a 2-node "flux pmi barrier"
# inside the batch instance under run_timeout 30.
test_expect_success 'run a 2 node job in the initial instance' '
83+
wait_for_service $(cat test2.uri) job-ingest &&
84+
run_timeout 30 flux proxy $(cat test2.uri) \
85+
flux run --label-io -N2 flux pmi barrier
86+
'
87+
# Submit a 1-node job that runs "flux broker" with broker.boot-server set
# to the batch job URI, then wait for its shell.start event.
test_expect_success 'submit a job that starts 1 extra broker' '
88+
id=$(flux submit -N1 flux broker \
89+
--config-path=fake2.toml \
90+
-Stbon.topo=kary:0 \
91+
-Sbroker.boot-server=$(cat test2.uri)) &&
92+
flux job wait-event -p guest.exec.eventlog $id shell.start
93+
'
94+
# Wait (with --timeout 10s) for the batch instance's overlay status to
# become "full".
test_expect_success 'wait for overlay status to be full' '
95+
flux proxy $(cat test2.uri) \
96+
flux overlay status --wait full --timeout 10s
97+
'
98+
# Run a 3-node barrier inside the batch instance now that the extra
# broker has been started.
test_expect_success 'run a 3 node job in the expanded instance' '
99+
run_timeout 30 flux proxy $(cat test2.uri) \
100+
flux run --label-io -N3 flux pmi barrier
101+
'
102+
# Cancel every job submitted so far.
test_expect_success 'clean up' '
103+
flux cancel --all
104+
'
105+
106+
#
107+
# Start 3 node batch job with four extra slots (kary:2).
108+
# Submit 4 node broker job that fills the slots.
109+
# Run a parallel job across all seven nodes in the batch job.
110+
# This test is constrained so that all flubbed nodes are leaf nodes,
111+
# but they are grafted on different nodes depending on topology.
112+
# 0
113+
# 1 2
114+
# 3 4 5 6 <-- flubbed
115+
116+
# Write fake3.toml: a resource config naming seven hosts (a through g)
# with cores 0-3 each and noverify = true.
test_expect_success 'create config with fake resources' '
117+
cat >fake3.toml <<-EOT
118+
[resource]
119+
noverify = true
120+
[[resource.config]]
121+
hosts = "a,b,c,d,e,f,g"
122+
cores = "0-3"
123+
EOT
124+
'
125+
# Start a 3-node batch job whose brokers are given size=7, quorum=3 and
# tbon.topo=kary:2, then save the job URI in test3.uri.
test_expect_success 'start a 3 node job with 4 extra ranks' '
126+
id=$(flux batch -N3 \
127+
--broker-opts=--config-path=fake3.toml \
128+
--broker-opts=-Ssize=7 \
129+
--broker-opts=-Sbroker.quorum=3 \
130+
--broker-opts=-Stbon.topo=kary:2 \
131+
--wrap sleep inf) &&
132+
get_job_uri $id >test3.uri
133+
'
134+
# The instance must report size 7 even though only 3 nodes were requested.
test_expect_success 'job has size 7' '
135+
size=$(flux proxy $(cat test3.uri) flux getattr size) &&
136+
test $size -eq 7
137+
'
138+
# Wait until job-ingest answers pings, then run a 3-node "flux pmi barrier"
# inside the batch instance under run_timeout 30.
test_expect_success 'run a 3 node job in the initial instance' '
139+
wait_for_service $(cat test3.uri) job-ingest &&
140+
run_timeout 30 flux proxy $(cat test3.uri) \
141+
flux run --label-io -N3 flux pmi barrier
142+
'
143+
# Submit a 4-node job that runs "flux broker" with broker.boot-server set
# to the batch job URI, then wait for its shell.start event.
test_expect_success 'submit a job that starts 4 extra brokers' '
144+
id=$(flux submit -N4 flux broker \
145+
--config-path=fake3.toml \
146+
-Stbon.topo=kary:2 \
147+
-Sbroker.boot-server=$(cat test3.uri)) &&
148+
flux job wait-event -p guest.exec.eventlog $id shell.start
149+
'
150+
# Wait (with --timeout 10s) for the batch instance's overlay status to
# become "full".
test_expect_success 'wait for overlay status to be full' '
151+
flux proxy $(cat test3.uri) \
152+
flux overlay status --wait full --timeout 10s
153+
'
154+
# Run a 7-node barrier inside the batch instance now that the extra
# brokers have been started.
test_expect_success 'run a 7 node job in the expanded instance' '
155+
run_timeout 30 flux proxy $(cat test3.uri) \
156+
flux run --label-io -N7 flux pmi barrier
157+
'
158+
# Cancel every job submitted so far.
test_expect_success 'clean up' '
159+
flux cancel --all
160+
'
161+
162+
#
163+
# Start 1 node batch job with 6 extra slots (kary:2).
164+
# Submit 2 node broker job that fills the first level slots.
165+
# Run a 3 node parallel job.
166+
# Submit 4 node broker job that fills the second level slots.
167+
# Run a 7 node parallel job.
168+
# 0
169+
# 1 2 <-- flubbed (phase 1)
170+
# 3 4 5 6 <-- flubbed (phase 2)
171+
# This test is constrained so the first level wires up before
172+
# the second level is started.
173+
174+
# Start a 1-node batch job whose brokers are given size=7, quorum=1 and
# tbon.topo=kary:2, then save the job URI in test4.uri.
# fake3.toml is not created in this group of tests; it must already exist
# in the working directory.
test_expect_success 'start a 1 node job with 6 extra ranks' '
175+
id=$(flux batch -N1 \
176+
--broker-opts=--config-path=fake3.toml \
177+
--broker-opts=-Ssize=7 \
178+
--broker-opts=-Sbroker.quorum=1 \
179+
--broker-opts=-Stbon.topo=kary:2 \
180+
--wrap sleep inf) &&
181+
get_job_uri $id >test4.uri
182+
'
183+
# Wait until job-ingest answers pings, then run a 1-node "flux pmi barrier"
# inside the batch instance under run_timeout 30.
test_expect_success 'run a 1 node job in the initial instance' '
184+
wait_for_service $(cat test4.uri) job-ingest &&
185+
run_timeout 30 flux proxy $(cat test4.uri) \
186+
flux run --label-io -N1 flux pmi barrier
187+
'
188+
# The instance must report size 7 even though only 1 node was requested.
test_expect_success 'job has size 7' '
189+
size=$(flux proxy $(cat test4.uri) flux getattr size) &&
190+
test $size -eq 7
191+
'
192+
# Phase 1: submit a 2-node job that runs "flux broker" with
# broker.boot-server set to the batch job URI, then wait for shell.start.
test_expect_success 'submit a job that starts 2 extra brokers' '
193+
id=$(flux submit -N2 flux broker \
194+
--config-path=fake3.toml \
195+
-Stbon.topo=kary:2 \
196+
-Sbroker.boot-server=$(cat test4.uri)) &&
197+
flux job wait-event -p guest.exec.eventlog $id shell.start
198+
'
199+
# Run a 3-node barrier inside the batch instance.  There is no separate
# overlay status wait after phase 1; this run is the only check made
# before phase 2 is submitted.
test_expect_success 'run a 3 node job in the expanded instance' '
200+
run_timeout 30 flux proxy $(cat test4.uri) \
201+
flux run --label-io -N3 flux pmi barrier
202+
'
203+
# Phase 2: submit a 4-node job that runs "flux broker" against the same
# boot server, then wait for shell.start.
test_expect_success 'submit a job that starts 4 extra brokers' '
204+
id=$(flux submit -N4 flux broker \
205+
--config-path=fake3.toml \
206+
-Stbon.topo=kary:2 \
207+
-Sbroker.boot-server=$(cat test4.uri)) &&
208+
flux job wait-event -p guest.exec.eventlog $id shell.start
209+
'
210+
# Wait (with --timeout 10s) for the batch instance's overlay status to
# become "full".
test_expect_success 'wait for overlay status to be full' '
211+
flux proxy $(cat test4.uri) \
212+
flux overlay status --wait full --timeout 10s
213+
'
214+
# Run a 7-node barrier inside the batch instance now that both phases
# of brokers have been started.
test_expect_success 'run a 7 node job in the expanded instance' '
215+
run_timeout 30 flux proxy $(cat test4.uri) \
216+
flux run --label-io -N7 flux pmi barrier
217+
'
218+
# Cancel every job submitted so far.
test_expect_success 'clean up' '
219+
flux cancel --all
220+
'
221+
222+
#
223+
# Start 1 node batch job with 6 extra slots (kary:2).
224+
# Submit 6 node broker job that fills all the slots.
225+
# Run a 7 node parallel job.
226+
#
227+
# Start a 1-node batch job whose brokers are given size=7, quorum=1 and
# tbon.topo=kary:2, then save the job URI in test5.uri.
# fake3.toml is not created in this group of tests; it must already exist
# in the working directory.
test_expect_success 'start a 1 node job with 6 extra ranks' '
228+
id=$(flux batch -N1 \
229+
--broker-opts=--config-path=fake3.toml \
230+
--broker-opts=-Ssize=7 \
231+
--broker-opts=-Sbroker.quorum=1 \
232+
--broker-opts=-Stbon.topo=kary:2 \
233+
--wrap sleep inf) &&
234+
get_job_uri $id >test5.uri
235+
'
236+
# Wait until job-ingest answers pings, then run a 1-node "flux pmi barrier"
# inside the batch instance under run_timeout 30.
test_expect_success 'run a 1 node job in the initial instance' '
237+
wait_for_service $(cat test5.uri) job-ingest &&
238+
run_timeout 30 flux proxy $(cat test5.uri) \
239+
flux run --label-io -N1 flux pmi barrier
240+
'
241+
# The instance must report size 7 even though only 1 node was requested.
test_expect_success 'job has size 7' '
242+
size=$(flux proxy $(cat test5.uri) flux getattr size) &&
243+
test $size -eq 7
244+
'
245+
# Submit a single 6-node job that runs "flux broker" with
# broker.boot-server set to the batch job URI, then wait for shell.start.
test_expect_success 'submit a job that starts 6 extra brokers' '
246+
id=$(flux submit -N6 flux broker \
247+
--config-path=fake3.toml \
248+
-Stbon.topo=kary:2 \
249+
-Sbroker.boot-server=$(cat test5.uri)) &&
250+
flux job wait-event -p guest.exec.eventlog $id shell.start
251+
'
252+
# Wait (with --timeout 10s) for the batch instance's overlay status to
# become "full".
test_expect_success 'wait for overlay status to be full' '
253+
flux proxy $(cat test5.uri) \
254+
flux overlay status --wait full --timeout 10s
255+
'
256+
# Run a 7-node barrier inside the batch instance now that the extra
# brokers have been started.
test_expect_success 'run a 7 node job in the expanded instance' '
257+
run_timeout 30 flux proxy $(cat test5.uri) \
258+
flux run --label-io -N7 flux pmi barrier
259+
'
260+
# Cancel every job submitted so far.
test_expect_success 'clean up' '
261+
flux cancel --all
262+
'
263+
264+
# Finish the sharness run; no tests may follow this line.
test_done

0 commit comments

Comments
 (0)