#!/bin/sh
#

test_description='Test flub bootstrap method'

. `dirname $0`/sharness.sh

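# Launch an 8-broker test instance with the full service personality so
# that jobs (and nested instances) can be run under it.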
test_under_flux 8 full

export FLUX_URI_RESOLVE_LOCAL=t

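# Wait for a batch job's "memo" event (which carries the instance URI)
# before resolving the job id with flux-uri(1).  FLUX_URI_RESOLVE_LOCAL=t
# above forces resolution to a local:// URI.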
# usage: get_job_uri id
get_job_uri() {
	flux job wait-event -t10 $1 memo >/dev/null && flux uri $1
}

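# Services in a subinstance may not be loaded yet when its URI first
# becomes resolvable, so poll with flux-ping(1) until the named service
# answers.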
# usage: wait_for_service uri name
wait_for_service() {
	flux proxy $1 bash -c "while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"
}

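# In outline, the flub bootstrap method exercised below works like this:
# a broker started with -Sbroker.boot-server=URI contacts the running
# instance at URI, is assigned one of its unused ranks, and then joins
# that instance's overlay network as the assigned rank.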
test_expect_success 'broker fails with bad broker.boot-server' '
	test_must_fail flux broker \
		-Sbroker.rc1_path= -Sbroker.rc3_path= \
		-Sbroker.boot-server=local://noexist/path \
		/bin/true 2>server.err &&
	grep "was not found" server.err
'

test_expect_success 'start a 1 node job with 0 extra ranks' '
	id=$(flux batch -N1 --wrap sleep inf) &&
	get_job_uri $id >test1.uri
'
test_expect_success 'job has size 1' '
	size=$(flux proxy $(cat test1.uri) flux getattr size) &&
	test $size -eq 1
'
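# A size-1 instance has no unused ranks to hand out, so a flub join
# against it must fail.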
test_expect_success 'flub bootstrap fails with no available ranks' '
	test_must_fail flux broker \
		-Sbroker.boot-server=$(cat test1.uri) 2>noranks.err &&
	grep "no available ranks" noranks.err
'
test_expect_success 'clean up' '
	flux cancel --all
'


#
# Start a 2 node batch job with one extra slot.
# Submit a 1 node broker job that fills the slot.
# Run a parallel job across all three nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# and the flubbed nodes connect to rank 0 only.
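# (tbon.topo=kary:0 requests a flat tree: every rank other than 0 is a
# leaf whose TBON parent is rank 0.)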

test_expect_success 'create config with 3 fake nodes' '
	cat >fake3.toml <<-EOT
	[resource]
	noverify = true
	[[resource.config]]
	hosts = "a,b,c"
	cores = "0-3"
	EOT
'
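# resource.noverify lets the brokers come up even though the fake
# hostnames a,b,c do not match the hosts the brokers actually run on.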
test_expect_success 'start a 2 node job with 1 extra rank' '
	id=$(flux batch -N2 \
		--broker-opts=--config-path=fake3.toml \
		--broker-opts=-Ssize=3 \
		--broker-opts=-Sbroker.quorum=2 \
		--broker-opts=-Stbon.topo=kary:0 \
		--wrap sleep inf) &&
	get_job_uri $id >test2.uri
'
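# -Ssize=3 declares a 3 rank instance even though only 2 brokers are
# started, and -Sbroker.quorum=2 lets rc1 proceed once those 2 ranks
# are online rather than waiting for the full size.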
test_expect_success 'job has size 3' '
	size=$(flux proxy $(cat test2.uri) flux getattr size) &&
	test $size -eq 3
'
test_expect_success 'overlay status shows extra node offline' '
	flux proxy $(cat test2.uri) \
		flux overlay status --no-pretty >ov2.out &&
	grep "2 extra0: offline" ov2.out
'
test_expect_success 'run a 2 node job in the initial instance' '
	wait_for_service $(cat test2.uri) job-ingest &&
	run_timeout 30 flux proxy $(cat test2.uri) \
		flux run --label-io -N2 flux pmi barrier
'
test_expect_success 'submit a job that starts 1 extra broker' '
	id=$(flux submit -N1 flux broker \
		--config-path=fake3.toml \
		-Stbon.topo=kary:0 \
		-Sbroker.boot-server=$(cat test2.uri)) &&
	flux job wait-event -p guest.exec.eventlog $id shell.start
'
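# The new broker should claim the vacant rank 2 via the boot server,
# bringing the overlay status to full.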
test_expect_success 'wait for overlay status to be full' '
	flux proxy $(cat test2.uri) \
		flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 3 node job in the expanded instance' '
	run_timeout 30 flux proxy $(cat test2.uri) \
		flux run --label-io -N3 flux pmi barrier
'
test_expect_success 'clean up' '
	flux cancel --all
'

test_expect_success 'create config with 7 fake nodes' '
	cat >fake7.toml <<-EOT
	[resource]
	noverify = true
	[[resource.config]]
	hosts = "a,b,c,d,e,f,g"
	cores = "0-3"
	EOT
'

#
# Start a 1 node batch job with 6 extra slots (kary:2).
# Submit a 6 node broker job that fills all the slots.
# Run a 7 node parallel job.
#
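# (tbon.topo=kary:2 requests a binary tree: with size=7, rank 0 has
# children 1-2, rank 1 has children 3-4, and rank 2 has children 5-6,
# so some flubbed brokers become interior nodes rather than leaves.)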
test_expect_success 'start a 1 node job with 6 extra ranks' '
	id=$(flux batch -N1 \
		--broker-opts=--config-path=fake7.toml \
		--broker-opts=-Ssize=7 \
		--broker-opts=-Sbroker.quorum=1 \
		--broker-opts=-Stbon.topo=kary:2 \
		--wrap sleep inf) &&
	get_job_uri $id >test5.uri
'
test_expect_success 'run a 1 node job in the initial instance' '
	wait_for_service $(cat test5.uri) job-ingest &&
	run_timeout 30 flux proxy $(cat test5.uri) \
		flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
	size=$(flux proxy $(cat test5.uri) flux getattr size) &&
	test $size -eq 7
'
# N.B. include exit-timeout=none so we can safely disconnect one node later
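# (by default the job shell raises a fatal job exception some time after
# the first task exits, which would take down the remaining brokers too)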
test_expect_success 'submit a job that starts 6 extra brokers' '
	id=$(flux submit -N6 -o exit-timeout=none \
		flux broker \
		--config-path=fake7.toml \
		-Stbon.topo=kary:2 \
		-Sbroker.boot-server=$(cat test5.uri)) &&
	flux job wait-event -p guest.exec.eventlog $id shell.start &&
	echo $id >xtra_id
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy $(cat test5.uri) \
		flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
	run_timeout 30 flux proxy $(cat test5.uri) \
		flux run --label-io -N7 flux pmi barrier
'

#
# Show that a node can be replaced.
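# Disconnecting rank 6 forces it offline; a freshly flubbed broker can
# then claim the vacated rank and restore the overlay to full.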

test_expect_success 'disconnect rank 6' '
	flux proxy $(cat test5.uri) \
		flux overlay disconnect 6
'
test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' '
	test_must_fail flux proxy $(cat test5.uri) \
		flux ping -c1 6
'
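# The failed ping gives the overlay direct evidence that rank 6 is
# unreachable, so its status should drop from full to degraded.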
test_expect_success 'wait for overlay status to be degraded' '
	flux proxy $(cat test5.uri) \
		flux overlay status --summary --wait degraded --timeout 10s
'
test_expect_success 'submit a job that starts 1 broker' '
	id=$(flux submit -N1 flux broker \
		--config-path=fake7.toml \
		-Stbon.topo=kary:2 \
		-Sbroker.boot-server=$(cat test5.uri)) &&
	flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy $(cat test5.uri) \
		flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
	run_timeout 30 flux proxy $(cat test5.uri) \
		flux run --label-io -N7 flux pmi barrier
'

test_expect_success 'clean up' '
	flux cancel --all
'

test_done