|
| 1 | +#!/bin/sh |
| 2 | +# |
| 3 | + |
# One-line summary of this script; set before sharness.sh is sourced.
test_description='Test flub bootstrap method'

# Load the sharness library that lives next to this script.  The path is
# quoted so a checkout directory containing whitespace still works.
. "$(dirname "$0")/sharness.sh"

# NOTE(review): presumably (re)starts this script under a test instance of
# size 8 using the "full" personality -- confirm against the sharness.d
# helpers, which are not visible from this file.
test_under_flux 8 full

# Use the tssh script from the test source tree wherever FLUX_SSH is consulted.
export FLUX_SSH="${SHARNESS_TEST_SRCDIR}/scripts/tssh"
| 11 | + |
# usage: get_job_uri id
#
# Print the URI of job ID on stdout.
#
# First waits (bounded by the -t10 timeout) for the job to post a "memo"
# event, discarding the event text, and only then runs "flux uri ID".
# Returns non-zero if the wait fails or times out, or if "flux uri" fails;
# in the former case nothing is written to stdout.
#
# The id is quoted so that an empty or odd-looking value is passed through
# as a single argument instead of shifting "memo" into the id position.
get_job_uri() {
    flux job wait-event -t10 "$1" memo >/dev/null \
        && flux uri "$1"
}
| 16 | + |
# usage: wait_for_service uri name
#
# Block until service NAME answers "flux ping" when run through
# "flux proxy URI".  The remote loop pings once, sleeps 0.5s on failure and
# retries; it has no timeout of its own, so callers rely on an outer limit.
#
# The bash -c script is handed to flux proxy wrapped in *literal* double
# quotes (the \" at both ends).
# NOTE(review): presumably flux proxy re-joins its arguments into a single
# shell command line, so the literal quotes are what keep the loop together
# as one bash -c argument -- confirm against flux-proxy(1) before changing.
#
# NAME is expanded locally into the script text; URI is quoted so it always
# reaches flux proxy as exactly one argument.
wait_for_service() {
    flux proxy "$1" bash -c \
        "\"while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done\""
}
| 21 | + |
# A boot-server URI that cannot be resolved must make the broker fail and
# report "was not found" on stderr.
test_expect_success 'broker fails with bad broker.boot-server' '
    test_must_fail flux broker \
        -Sbroker.rc1_path= -Sbroker.rc3_path= \
        -Sbroker.boot-server=local://noexist/path \
        /bin/true 2>server.err &&
    grep "was not found" server.err
'

# Start a 1 node batch job with no extra ranks and save its URI; a flub
# bootstrap against it must be refused with "no available ranks".
test_expect_success 'start a 1 node job with 0 extra ranks' '
    id=$(flux batch -N1 --wrap sleep inf) &&
    get_job_uri "$id" >test1.uri
'
test_expect_success 'job has size 1' '
    size=$(flux proxy "$(cat test1.uri)" flux getattr size) &&
    test "$size" -eq 1
'
test_expect_success 'flub bootstrap fails with no available ranks' '
    test_must_fail flux broker \
        "-Sbroker.boot-server=$(cat test1.uri)" 2>noranks.err &&
    grep "no available ranks" noranks.err
'
test_expect_success 'clean up' '
    flux cancel --all
'
| 46 | + |
| 47 | + |
#
# Start 2 node batch job with one extra slot.
# Submit 1 node broker job that fills the slot.
# Run a parallel job across all three nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# and the flubbed nodes connect to rank 0 only.
#

# The config is written with printf rather than a "<<-" here-document so
# the result does not depend on this file being indented with hard tabs.
test_expect_success 'create config with fake resources' '
    printf "%s\n" \
        "[resource]" \
        "noverify = true" \
        "[[resource.config]]" \
        "hosts = \"a,b,c\"" \
        "cores = \"0-3\"" >fake2.toml
'
# -N2 with size=3 leaves one rank of the instance unfilled.
test_expect_success 'start a 2 node job with 1 extra rank' '
    id=$(flux batch -N2 \
        --broker-opts=--config-path=fake2.toml \
        --broker-opts=-Ssize=3 \
        --broker-opts=-Sbroker.quorum=2 \
        --broker-opts=-Stbon.topo=kary:0 \
        --wrap sleep inf) &&
    get_job_uri "$id" >test2.uri
'
test_expect_success 'job has size 3' '
    size=$(flux proxy "$(cat test2.uri)" flux getattr size) &&
    test "$size" -eq 3
'
test_expect_success 'overlay status shows extra node offline' '
    flux proxy "$(cat test2.uri)" \
        flux overlay status --no-pretty >ov2.out &&
    grep "2 extra0: offline" ov2.out
'
# Wait for job-ingest to answer before submitting work to the new instance.
test_expect_success 'run a 2 node job in the initial instance' '
    wait_for_service "$(cat test2.uri)" job-ingest &&
    run_timeout 30 flux proxy "$(cat test2.uri)" \
        flux run --label-io -N2 flux pmi barrier
'
# The extra broker runs as a job in the enclosing instance and bootstraps
# from the batch job via broker.boot-server.
test_expect_success 'submit a job that starts 1 extra broker' '
    id=$(flux submit -N1 flux broker \
        --config-path=fake2.toml \
        -Stbon.topo=kary:0 \
        "-Sbroker.boot-server=$(cat test2.uri)") &&
    flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
test_expect_success 'wait for overlay status to be full' '
    flux proxy "$(cat test2.uri)" \
        flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 3 node job in the expanded instance' '
    run_timeout 30 flux proxy "$(cat test2.uri)" \
        flux run --label-io -N3 flux pmi barrier
'
test_expect_success 'clean up' '
    flux cancel --all
'
| 105 | + |
#
# Start 3 node batch job with four extra slots (kary:2).
# Submit 4 node broker job that fills the slots.
# Run a parallel job across all seven nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# but they are grafted on different nodes depending on topology.
#
#          0
#      1       2
#    3   4   5   6    <-- flubbed
#

# The config is written with printf rather than a "<<-" here-document so
# the result does not depend on this file being indented with hard tabs.
test_expect_success 'create config with fake resources' '
    printf "%s\n" \
        "[resource]" \
        "noverify = true" \
        "[[resource.config]]" \
        "hosts = \"a,b,c,d,e,f,g\"" \
        "cores = \"0-3\"" >fake3.toml
'
# -N3 with size=7 leaves four ranks of the instance unfilled.
test_expect_success 'start a 3 node job with 4 extra ranks' '
    id=$(flux batch -N3 \
        --broker-opts=--config-path=fake3.toml \
        --broker-opts=-Ssize=7 \
        --broker-opts=-Sbroker.quorum=3 \
        --broker-opts=-Stbon.topo=kary:2 \
        --wrap sleep inf) &&
    get_job_uri "$id" >test3.uri
'
test_expect_success 'job has size 7' '
    size=$(flux proxy "$(cat test3.uri)" flux getattr size) &&
    test "$size" -eq 7
'
# Wait for job-ingest to answer before submitting work to the new instance.
test_expect_success 'run a 3 node job in the initial instance' '
    wait_for_service "$(cat test3.uri)" job-ingest &&
    run_timeout 30 flux proxy "$(cat test3.uri)" \
        flux run --label-io -N3 flux pmi barrier
'
test_expect_success 'submit a job that starts 4 extra brokers' '
    id=$(flux submit -N4 flux broker \
        --config-path=fake3.toml \
        -Stbon.topo=kary:2 \
        "-Sbroker.boot-server=$(cat test3.uri)") &&
    flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
test_expect_success 'wait for overlay status to be full' '
    flux proxy "$(cat test3.uri)" \
        flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
    run_timeout 30 flux proxy "$(cat test3.uri)" \
        flux run --label-io -N7 flux pmi barrier
'
test_expect_success 'clean up' '
    flux cancel --all
'
| 161 | + |
#
# Start 1 node batch job with 6 extra slots (kary:2).
# Submit 2 node broker job that fills the first level slots.
# Run a 3 node parallel job.
# Submit 4 node broker job that fills the second level slots.
# Run a 7 node parallel job.
#
#          0
#      1       2      <-- flubbed (phase 1)
#    3   4   5   6    <-- flubbed (phase 2)
#
# This test is constrained so the first level wires up before
# the second level is started.
#

# Uses the fake3.toml written by the earlier 'create config' test.
test_expect_success 'start a 1 node job with 6 extra ranks' '
    id=$(flux batch -N1 \
        --broker-opts=--config-path=fake3.toml \
        --broker-opts=-Ssize=7 \
        --broker-opts=-Sbroker.quorum=1 \
        --broker-opts=-Stbon.topo=kary:2 \
        --wrap sleep inf) &&
    get_job_uri "$id" >test4.uri
'
# Wait for job-ingest to answer before submitting work to the new instance.
test_expect_success 'run a 1 node job in the initial instance' '
    wait_for_service "$(cat test4.uri)" job-ingest &&
    run_timeout 30 flux proxy "$(cat test4.uri)" \
        flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
    size=$(flux proxy "$(cat test4.uri)" flux getattr size) &&
    test "$size" -eq 7
'
# Phase 1: two brokers.
test_expect_success 'submit a job that starts 2 extra brokers' '
    id=$(flux submit -N2 flux broker \
        --config-path=fake3.toml \
        -Stbon.topo=kary:2 \
        "-Sbroker.boot-server=$(cat test4.uri)") &&
    flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
# The 3 node run doubles as the check that the phase 1 brokers have joined
# before phase 2 is submitted.
test_expect_success 'run a 3 node job in the expanded instance' '
    run_timeout 30 flux proxy "$(cat test4.uri)" \
        flux run --label-io -N3 flux pmi barrier
'
# Phase 2: four more brokers.
test_expect_success 'submit a job that starts 4 extra brokers' '
    id=$(flux submit -N4 flux broker \
        --config-path=fake3.toml \
        -Stbon.topo=kary:2 \
        "-Sbroker.boot-server=$(cat test4.uri)") &&
    flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
test_expect_success 'wait for overlay status to be full' '
    flux proxy "$(cat test4.uri)" \
        flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
    run_timeout 30 flux proxy "$(cat test4.uri)" \
        flux run --label-io -N7 flux pmi barrier
'
test_expect_success 'clean up' '
    flux cancel --all
'
| 221 | + |
#
# Start 1 node batch job with 6 extra slots (kary:2).
# Submit 6 node broker job that fills all the slots.
# Run a 7 node parallel job.
#

# Uses the fake3.toml written by the earlier 'create config' test.
test_expect_success 'start a 1 node job with 6 extra ranks' '
    id=$(flux batch -N1 \
        --broker-opts=--config-path=fake3.toml \
        --broker-opts=-Ssize=7 \
        --broker-opts=-Sbroker.quorum=1 \
        --broker-opts=-Stbon.topo=kary:2 \
        --wrap sleep inf) &&
    get_job_uri "$id" >test5.uri
'
# Wait for job-ingest to answer before submitting work to the new instance.
test_expect_success 'run a 1 node job in the initial instance' '
    wait_for_service "$(cat test5.uri)" job-ingest &&
    run_timeout 30 flux proxy "$(cat test5.uri)" \
        flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
    size=$(flux proxy "$(cat test5.uri)" flux getattr size) &&
    test "$size" -eq 7
'
# All six extra brokers are started by a single job, whose id is saved in
# the file xtra_id.
# NOTE(review): exit-timeout=none presumably keeps this job running after
# one of its brokers exits -- confirm against the job shell options docs.
test_expect_success 'submit a job that starts 6 extra brokers' '
    id=$(flux submit -N6 -o exit-timeout=none \
        flux broker \
        --config-path=fake3.toml \
        -Stbon.topo=kary:2 \
        "-Sbroker.boot-server=$(cat test5.uri)") &&
    flux job wait-event -p guest.exec.eventlog "$id" shell.start &&
    echo "$id" >xtra_id
'
test_expect_success 'wait for overlay status to be full' '
    flux proxy "$(cat test5.uri)" \
        flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
    run_timeout 30 flux proxy "$(cat test5.uri)" \
        flux run --label-io -N7 flux pmi barrier
'
| 262 | + |
#
# Show that a node can be replaced.
# These tests operate on the instance whose URI is stored in test5.uri.
#

test_expect_success 'disconnect rank 6' '
    flux proxy "$(cat test5.uri)" \
        flux overlay disconnect 6
'
# A ping to the disconnected rank is expected to fail.
test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' '
    test_must_fail flux proxy "$(cat test5.uri)" \
        flux ping -c1 6
'
test_expect_success 'wait for overlay status to be degraded' '
    flux proxy "$(cat test5.uri)" \
        flux overlay status --summary --wait degraded --timeout 10s
'
# One new broker is started to fill the slot left by the disconnected rank.
test_expect_success 'submit a job that starts 1 broker' '
    id=$(flux submit -N1 flux broker \
        --config-path=fake3.toml \
        -Stbon.topo=kary:2 \
        "-Sbroker.boot-server=$(cat test5.uri)") &&
    flux job wait-event -p guest.exec.eventlog "$id" shell.start
'
test_expect_success 'wait for overlay status to be full' '
    flux proxy "$(cat test5.uri)" \
        flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
    run_timeout 30 flux proxy "$(cat test5.uri)" \
        flux run --label-io -N7 flux pmi barrier
'

test_expect_success 'clean up' '
    flux cancel --all
'

test_done
0 commit comments