Skip to content

Commit 33e7a7b

Browse files
garlick authored and milroy committed
testsuite: add test for reloading with rv1_nosched
Problem: a few specific use cases related to reloading fluxion when rv1_nosched is set are not fully covered by existing tests.

Check the following:
- jobs are not killed when modules are reloaded (no queues configured)
- jobs are not killed when modules are reloaded (queues configured)
- no resources are double booked in those cases
- jobs submitted to the anon queue are killed when the scheduler is reloaded with queues configured
- jobs submitted to a named queue are killed when the scheduler is reloaded without queues configured
1 parent 2b92949 commit 33e7a7b

File tree

3 files changed

+141
-0
lines changed

3 files changed

+141
-0
lines changed

t/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ set(ALL_TESTS
2727
t1022-property-constraints.t
2828
t1023-multiqueue-constraints.t
2929
t1024-alloc-check.t
30+
t1025-rv1-reload.t
3031
t3000-jobspec.t
3132
t3001-resource-basic.t
3233
t3002-resource-prefix.t

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ TESTS = \
4545
t1022-property-constraints.t \
4646
t1023-multiqueue-constraints.t \
4747
t1024-alloc-check.t \
48+
t1025-rv1-reload.t \
4849
t3000-jobspec.t \
4950
t3001-resource-basic.t \
5051
t3002-resource-prefix.t \

t/t1025-rv1-reload.t

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/bin/sh
2+
# Exercise fluxion module reloads with match-format rv1_nosched while jobs are running
3+
4+
test_description='Check that fluxion (rv1_nosched) does not kill jobs on reload'
5+
6+
. "$(dirname "$0")/sharness.sh"
7+
8+
export FLUX_SCHED_MODULE=none
9+
test_under_flux 1
10+
11+
test_expect_success 'configure fluxion with rv1_nosched' '
12+
cat >config.toml <<-EOT &&
13+
[sched-fluxion-resource]
14+
match-format = "rv1_nosched"
15+
EOT
16+
flux config load config.toml
17+
'
18+
test_expect_success 'add testqueue property to rank 0 (for later)' '
19+
flux resource R | flux R set-property testqueue:0 >R &&
20+
flux resource reload R
21+
'
22+
# N.B. double booked jobs get a fatal exception on alloc
23+
test_expect_success 'prepare to detect double booked resources' '
24+
flux jobtap load alloc-check.so
25+
'
26+
test_expect_success 'load fluxion modules' '
27+
load_resource &&
28+
load_qmanager_sync
29+
'
30+
#
31+
# Ensure jobs keep running across scheduler reload (no queues)
32+
#
33+
test_expect_success 'submit a sleep inf job and wait for alloc' '
34+
flux submit -n1 --flags=debug --wait-event=alloc sleep inf >job.id
35+
'
36+
test_expect_success 'reload fluxion modules' '
37+
remove_qmanager &&
38+
reload_resource &&
39+
load_qmanager
40+
'
41+
test_expect_success 'the job is still running' '
42+
state=$(flux jobs -n -o {state} "$(cat <job.id)") &&
43+
test "$state" = RUN
44+
'
45+
test_expect_success 'run a pile of jobs to check for double booking' '
46+
flux submit --wait --cc=1-32 -n1 true
47+
'
48+
test_expect_success 'cancel the original job and wait for it to be inactive' '
49+
flux cancel $(cat <job.id) &&
50+
flux job wait-event $(cat <job.id) clean
51+
'
52+
#
53+
# Ensure jobs keep running across scheduler reload (with one queue)
54+
#
55+
test_expect_success 'configure testqueue' '
56+
cat >config2.toml <<-EOT &&
57+
[sched-fluxion-resource]
58+
match-format = "rv1_nosched"
59+
[queues.testqueue]
60+
requires = ["testqueue"]
61+
EOT
62+
flux config load config2.toml
63+
'
64+
test_expect_success 'reload fluxion modules to get new queue config' '
65+
remove_qmanager &&
66+
reload_resource &&
67+
load_qmanager
68+
'
69+
test_expect_success 'start testqueue' '
70+
flux queue start -q testqueue
71+
'
72+
test_expect_success 'submit a sleep inf job to testqueue and wait for alloc' '
73+
flux submit -vv --wait-event=alloc -n1 -q testqueue sleep inf >job2.id
74+
'
75+
test_expect_success 'reload fluxion modules' '
76+
remove_qmanager &&
77+
reload_resource &&
78+
load_qmanager
79+
'
80+
test_expect_success 'the job is still running' '
81+
state=$(flux jobs -n -o {state} "$(cat <job2.id)") &&
82+
test "$state" = RUN
83+
'
84+
test_expect_success 'run a pile of jobs to check for double booking' '
85+
flux submit --wait --cc=1-32 -n1 -q testqueue true
86+
'
87+
#
88+
# A running job that was submitted to testqueue should get a fatal
89+
# exception when scheduler is reloaded with no queues configured.
90+
#
91+
# N.B. alloc-check won't accept a scheduler-restart exception in lieu of free
92+
# event so unload it to avoid false positives (flux-framework/flux-core#5889)
93+
test_expect_success 'unload alloc-check' '
94+
flux jobtap remove alloc-check.so
95+
'
96+
test_expect_success 'configure without queues' '
97+
flux config load config.toml
98+
'
99+
test_expect_success 'reload fluxion modules' '
100+
remove_qmanager &&
101+
reload_resource &&
102+
load_qmanager
103+
'
104+
test_expect_success 'running job received a fatal exception' '
105+
flux job wait-event -v -t 10s $(cat <job2.id) exception
106+
'
107+
test_expect_success 'wait for job to clean up' '
108+
flux job wait-event -v -t 10s $(cat <job2.id) clean
109+
'
110+
test_expect_success 'submit a sleep inf job to anon queue and wait for alloc' '
111+
flux submit -vv --wait-event=alloc -n1 sleep inf >job3.id
112+
'
113+
#
114+
# A running job that was submitted to the anon queue should get a fatal
115+
# exception when scheduler is reloaded with testqueue configured.
116+
#
117+
test_expect_success 'configure with testqueue' '
118+
flux config load config2.toml
119+
'
120+
test_expect_success 'reload fluxion modules' '
121+
remove_qmanager &&
122+
reload_resource &&
123+
load_qmanager
124+
'
125+
test_expect_success 'running job received a fatal exception' '
126+
flux job wait-event -v -t 10s $(cat <job3.id) exception
127+
'
128+
test_expect_success 'wait for job to clean up' '
129+
flux job wait-event -v -t 10s $(cat <job3.id) clean
130+
'
131+
test_expect_success 'clean up' '
132+
cleanup_active_jobs
133+
'
134+
test_expect_success 'remove fluxion modules' '
135+
remove_qmanager &&
136+
remove_resource
137+
'
138+
139+
test_done

0 commit comments

Comments
 (0)