Skip to content

Commit c3e68d4

Browse files
committed
testsuite: cover flub bootstrap
Problem: there is no test coverage for adding brokers to a flux instance. Add some tests.
1 parent a70f717 commit c3e68d4

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ TESTSCRIPTS = \
9090
t0023-jobspec1-validate.t \
9191
t0026-flux-R.t \
9292
t0033-size-override.t \
93+
t0034-flub.t \
9394
t1000-kvs.t \
9495
t1001-kvs-internals.t \
9596
t1003-kvs-stress.t \

t/t0034-flub.t

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#!/bin/sh
#
# Sharness tests for the flub bootstrap method: extra brokers are started
# with the broker.boot-server attribute pointing at an already running
# instance, and the tests check that the instance grows accordingly.
#

test_description='Test flub bootstrap method'

# Quote the script directory so that a checkout path containing whitespace
# does not break sourcing of the sharness library.
. "$(dirname "$0")/sharness.sh"

# NOTE(review): presumably re-runs this script under a test instance of
# size 8 with the "full" personality -- confirm in the sharness helpers.
test_under_flux 8 full
9+
10+
# usage: get_job_uri id
11+
# Print the local URI of job $1 on stdout.
#
# First waits for the "memo" event of the job (flux job wait-event -t10),
# discarding that command's output, and only then runs
# "flux uri --local".  Returns non-zero if either command fails.
# NOTE(review): presumably the URI is published via the memo event, which
# is why the wait is needed -- confirm.
#
# The job id is quoted so it always reaches flux as a single argument.
get_job_uri() {
	flux job wait-event -t10 "$1" memo >/dev/null && flux uri --local "$1"
}
14+
15+
# usage: wait_for_service uri name
16+
# Block until service $2 answers a ping when reached through the
# instance at URI $1.
#
# Runs, via "flux proxy URI bash -c", a loop that retries
# "flux ping -c 1 NAME" every 0.5s until it succeeds.  There is no upper
# bound on the number of retries.
#
# The URI is quoted so a path containing whitespace is passed as one
# argument.  The escaped quotes (\") are deliberate: they make literal
# double-quote characters part of the single word following "bash -c".
# NOTE(review): presumably flux proxy joins its arguments and hands them
# to a shell that parses them again -- confirm before changing the quoting.
wait_for_service() {
	flux proxy "$1" bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\"
}
19+
20+
#
# Failure cases for broker.boot-server.
#
# Expansions of $id, $size and the saved URI are quoted so that a value
# containing whitespace cannot be split into several arguments.
#

# A boot-server URI that does not exist must make the broker fail and
# report "was not found" on stderr.  rc1/rc3 paths are set to empty and
# /bin/true is passed as the broker command.
test_expect_success 'broker fails with bad broker.boot-server' '
	test_must_fail flux broker \
	    -Sbroker.rc1_path= -Sbroker.rc3_path= \
	    -Sbroker.boot-server=local://noexist/path \
	    /bin/true 2>server.err &&
	grep "was not found" server.err
'

# Start a one node batch job and save its URI in test1.uri for the
# following tests.
test_expect_success 'start a 1 node job with 0 extra ranks' '
	id=$(flux batch -N1 --wrap sleep inf) &&
	get_job_uri "$id" >test1.uri
'
test_expect_success 'job has size 1' '
	size=$(flux proxy "$(cat test1.uri)" flux getattr size) &&
	test "$size" -eq 1
'

# The size 1 instance has no spare rank to hand out, so a broker that
# tries to bootstrap from it must fail with "no available ranks".
test_expect_success 'flub bootstrap fails with no available ranks' '
	test_must_fail flux broker \
	    -Sbroker.boot-server="$(cat test1.uri)" 2>noranks.err &&
	grep "no available ranks" noranks.err
'
test_expect_success 'clean up' '
	flux cancel --all
'
44+
45+
46+
#
# Start 2 node batch job with one extra slot.
# Submit 1 node broker job that fills the slot.
# Run a parallel job across all three nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# and the flubbed nodes connect to rank 0 only.
#
# Expansions of $id, $size and the saved URI are quoted so that a value
# containing whitespace cannot be split into several arguments.
#

# Resource config describing three hosts (a, b, c) with cores 0-3 each
# and noverify = true.  The here-document uses <<- so its body must be
# indented with tabs, not spaces.
test_expect_success 'create config with 3 fake nodes' '
	cat >fake3.toml <<-EOT
	[resource]
	noverify = true
	[[resource.config]]
	hosts = "a,b,c"
	cores = "0-3"
	EOT
'

# The batch job runs on 2 nodes but its brokers are told size=3 with a
# quorum of 2, leaving one rank unfilled.  The job URI is saved in
# test2.uri for the following tests.
test_expect_success 'start a 2 node job with 1 extra rank' '
	id=$(flux batch -N2 \
	    --broker-opts=--config-path=fake3.toml \
	    --broker-opts=-Ssize=3 \
	    --broker-opts=-Sbroker.quorum=2 \
	    --broker-opts=-Stbon.topo=kary:0 \
	    --wrap sleep inf) &&
	get_job_uri "$id" >test2.uri
'
test_expect_success 'job has size 3' '
	size=$(flux proxy "$(cat test2.uri)" flux getattr size) &&
	test "$size" -eq 3
'
test_expect_success 'overlay status shows extra node offline' '
	flux proxy "$(cat test2.uri)" \
	    flux overlay status --no-pretty >ov2.out &&
	grep "2 extra0: offline" ov2.out
'

# Wait until job-ingest answers pings before submitting work to the
# batch job.
test_expect_success 'run a 2 node job in the initial instance' '
	wait_for_service "$(cat test2.uri)" job-ingest &&
	run_timeout 30 flux proxy "$(cat test2.uri)" \
	    flux run --label-io -N2 flux pmi barrier
'

# The extra broker is itself a job in the enclosing instance; it is
# pointed at the batch job through broker.boot-server.
test_expect_success 'submit a job that starts 1 extra broker' '
	id=$(flux submit -N1 flux broker \
	    --config-path=fake3.toml \
	    -Stbon.topo=kary:0 \
	    -Sbroker.boot-server="$(cat test2.uri)") &&
	flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy "$(cat test2.uri)" \
	    flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 3 node job in the expanded instance' '
	run_timeout 30 flux proxy "$(cat test2.uri)" \
	    flux run --label-io -N3 flux pmi barrier
'
test_expect_success 'clean up' '
	flux cancel --all
'
103+
104+
# Resource config describing seven hosts (a-g) with cores 0-3 each and
# noverify = true.  The here-document uses <<- so its body must be
# indented with tabs, not spaces.
test_expect_success 'create config with 7 fake nodes' '
	cat >fake7.toml <<-EOT
	[resource]
	noverify = true
	[[resource.config]]
	hosts = "a,b,c,d,e,f,g"
	cores = "0-3"
	EOT
'

#
# Start 1 node batch job with 6 extra slots (kary:2).
# Submit 6 node broker job that fills all the slots.
# Run a 7 node parallel job.
#
# Expansions of $id, $size and the saved URI are quoted so that a value
# containing whitespace cannot be split into several arguments.
#
test_expect_success 'start a 1 node job with 6 extra ranks' '
	id=$(flux batch -N1 \
	    --broker-opts=--config-path=fake7.toml \
	    --broker-opts=-Ssize=7 \
	    --broker-opts=-Sbroker.quorum=1 \
	    --broker-opts=-Stbon.topo=kary:2 \
	    --wrap sleep inf) &&
	get_job_uri "$id" >test5.uri
'

# Wait until job-ingest answers pings before submitting work to the
# batch job.
test_expect_success 'run a 1 node job in the initial instance' '
	wait_for_service "$(cat test5.uri)" job-ingest &&
	run_timeout 30 flux proxy "$(cat test5.uri)" \
	    flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
	size=$(flux proxy "$(cat test5.uri)" flux getattr size) &&
	test "$size" -eq 7
'
# N.B. include exit-timeout=none so we can safely disconnect one node later
# The job id is also recorded in the file xtra_id.
test_expect_success 'submit a job that starts 6 extra brokers' '
	id=$(flux submit -N6 -o exit-timeout=none \
	    flux broker \
	    --config-path=fake7.toml \
	    -Stbon.topo=kary:2 \
	    -Sbroker.boot-server="$(cat test5.uri)") &&
	flux job wait-event -p guest.exec.eventlog "$id" shell.start &&
	echo "$id" >xtra_id
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy "$(cat test5.uri)" \
	    flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
	run_timeout 30 flux proxy "$(cat test5.uri)" \
	    flux run --label-io -N7 flux pmi barrier
'
155+
156+
#
# Show that a node can be replaced
#
# Rank 6 of the instance recorded in test5.uri is disconnected, the
# overlay is observed to become degraded, and a newly submitted single
# broker then brings the overlay back to full.
#
# Expansions of $id and the saved URI are quoted so that a value
# containing whitespace cannot be split into several arguments.
#

test_expect_success 'disconnect rank 6' '
	flux proxy "$(cat test5.uri)" \
	    flux overlay disconnect 6
'
test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' '
	test_must_fail flux proxy "$(cat test5.uri)" \
	    flux ping -c1 6
'
test_expect_success 'wait for overlay status to be degraded' '
	flux proxy "$(cat test5.uri)" \
	    flux overlay status --summary --wait degraded --timeout 10s
'

# Same broker options as the original six brokers, but only one node.
test_expect_success 'submit a job that starts 1 broker' '
	id=$(flux submit -N1 flux broker \
	    --config-path=fake7.toml \
	    -Stbon.topo=kary:2 \
	    -Sbroker.boot-server="$(cat test5.uri)") &&
	flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy "$(cat test5.uri)" \
	    flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
	run_timeout 30 flux proxy "$(cat test5.uri)" \
	    flux run --label-io -N7 flux pmi barrier
'

test_expect_success 'clean up' '
	flux cancel --all
'

test_done

0 commit comments

Comments
 (0)