Skip to content

Commit d3bc88f

Browse files
switch to ported slurm docker cluster (#1297)
* switch to ported slurm docker cluster * fix slurm config and new slurm image * split ci steps for better readability * fix ci names * fix job references * set default working directory * ci: docker compose: improve startup * update test script * only run test without config change * ci: fix logging and make slurm node privileged * apply format * ci: add extra slurm test run * add missing executor * register pytest mark * move marker to .py and print slurm logs * fix python read * remove unnecessary decode * add pytest_configuration type annotation * print seff output * add more prints * make seff optional * print properties * replace seff with sacct * detect out of memory using sacct * sacct print stdout * slurm conf constrain ram space * slurm job acct: disable over memory kill * switch cgroup v2 and ignore systemd * fix out of memory detection * restart slurm * remove pytest_configuration * apply format * retry gathering job information * fix linting errors * remove max parallel ci jobs * decrease sacct request frequencies * remove prints * remove prints 2 * use master docker cluster * apply suggestions * fix lints and format * add Changelog entry * add missing setuptools entry * cluster_tools: remove setuptools entry * Update cluster_tools/tests/test_slurm.py Co-authored-by: Philipp Otto <[email protected]> --------- Co-authored-by: Philipp Otto <[email protected]>
1 parent a35f9ce commit d3bc88f

File tree

8 files changed

+270
-209
lines changed

8 files changed

+270
-209
lines changed

.github/workflows/ci.yml

Lines changed: 147 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -29,121 +29,169 @@ jobs:
2929
cluster_tools:
3030
- 'cluster_tools/**'
3131
32-
cluster_tools:
32+
cluster_tools_slurm:
3333
needs: changes
3434
if: ${{ needs.changes.outputs.cluster_tools == 'true' }}
3535
runs-on: ubuntu-latest
3636
timeout-minutes: 30
3737
strategy:
38-
max-parallel: 4
3938
matrix:
40-
executors: [multiprocessing, slurm, kubernetes, dask]
4139
python-version: ["3.13", "3.12", "3.11", "3.10"]
4240
defaults:
4341
run:
4442
working-directory: cluster_tools
4543
steps:
46-
- uses: actions/checkout@v3
47-
- name: Install uv
48-
uses: astral-sh/setup-uv@v3
44+
- uses: actions/checkout@v4
45+
- uses: astral-sh/setup-uv@v6
4946
with:
5047
version: "0.6.3"
5148
enable-cache: true
5249
cache-dependency-glob: "cluster_tools/uv.lock"
53-
54-
- name: Set up Python ${{ matrix.python-version }}
55-
run: uv python install ${{ matrix.python-version }}
56-
- name: Build/pull dockered-slurm image
57-
if: ${{ matrix.executors == 'slurm' }}
50+
- run: uv python install ${{ matrix.python-version }}
51+
- name: Start Docker Cluster
52+
run: cd ./dockered-slurm && docker compose up -d
53+
- name: Log Core Container
5854
run: |
59-
cd ./dockered-slurm
60-
61-
echo docker compose up
62-
docker compose up -d
63-
64-
# Register cluster (with retry)
65-
for i in {1..5}; do
66-
echo register_cluster
67-
./register_cluster.sh && s=0 && break || s=$?
68-
sleep 10
55+
for name in "slurmctld" "c1" "c2"; do
56+
docker logs "$name"
6957
done
70-
71-
# Show log output for debugging
72-
docker logs slurmctld
73-
docker logs c1
74-
docker logs c2
75-
76-
# Run setup.py on all three nodes
77-
docker exec -w /cluster_tools slurmctld bash -c "uv sync --frozen" &
78-
docker exec -w /cluster_tools c1 bash -c "uv sync --frozen" &
79-
docker exec -w /cluster_tools c2 bash -c "uv sync --frozen" &
80-
wait
81-
82-
- name: Setup Kubernetes-in-Docker
83-
if: ${{ matrix.executors == 'kubernetes' }}
58+
- name: Install UV dependencies
8459
run: |
85-
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64
86-
chmod +x ./kind
87-
sed -i "s#__PATH__#$(pwd)#g" tests/cluster-config.yaml
88-
./kind create cluster --config=tests/cluster-config.yaml
89-
./kind export kubeconfig
90-
91-
docker build \
92-
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
93-
-f tests/Dockerfile \
94-
-t scalableminds/cluster-tools:latest \
95-
.
96-
./kind load docker-image scalableminds/cluster-tools:latest
60+
for name in "slurmctld" "c1" "c2"; do
61+
docker exec -w /cluster_tools "$name" bash -c "uv sync --frozen"
62+
done
63+
- name: "Run Tests (test_all, test_slurm) without modified slurm.conf"
64+
run: |
65+
docker exec \
66+
-w /cluster_tools/tests \
67+
-e PYTEST_EXECUTORS=slurm \
68+
slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py -m 'not requires_modified_slurm_config'"
69+
- name: "Run Tests (test_deref_main)"
70+
run: |
71+
docker exec \
72+
-w /cluster_tools/tests \
73+
slurmctld bash -c "uv run --frozen python test_deref_main.py"
9774
98-
- name: Install dependencies (without docker)
99-
if: ${{ matrix.executors == 'multiprocessing' }}
75+
- name: Update Slurm Config
10076
run: |
101-
uv sync --frozen
77+
echo "MaxArraySize=2" >> ./dockered-slurm/slurm.conf
78+
sed "s/JobAcctGatherFrequency=30/JobAcctGatherFrequency=1/g" ./dockered-slurm/slurm.conf > ./dockered-slurm/slurm.conf.tmp
79+
mv ./dockered-slurm/slurm.conf.tmp ./dockered-slurm/slurm.conf
80+
- name: Restart Slurm Cluster
81+
run: cd ./dockered-slurm && docker compose restart slurmctld c1 c2
10282

103-
- name: Install dependencies (without docker)
104-
if: ${{ matrix.executors == 'kubernetes' || matrix.executors == 'dask' }}
83+
- name: "Run Tests (test_all, test_slurm) with modified slurm.conf"
10584
run: |
106-
uv sync --all-extras --frozen
85+
# Run tests requiring a modified slurm config
86+
docker exec \
87+
-w /cluster_tools/tests \
88+
-e PYTEST_EXECUTORS=slurm \
89+
slurmctld bash -c "uv run --frozen python -m pytest -sv test_slurm.py -m 'requires_modified_slurm_config'"
10790
91+
cluster_tools_multiprocessing:
92+
needs: changes
93+
if: ${{ needs.changes.outputs.cluster_tools == 'true' }}
94+
runs-on: ubuntu-latest
95+
timeout-minutes: 30
96+
strategy:
97+
matrix:
98+
python-version: ["3.13", "3.12", "3.11", "3.10"]
99+
defaults:
100+
run:
101+
working-directory: cluster_tools
102+
steps:
103+
- uses: actions/checkout@v4
104+
- name: Install uv
105+
uses: astral-sh/setup-uv@v6
106+
with:
107+
version: "0.6.3"
108+
enable-cache: true
109+
cache-dependency-glob: "cluster_tools/uv.lock"
110+
- name: Set up Python ${{ matrix.python-version }}
111+
run: uv python install ${{ matrix.python-version }}
112+
- name: Install dependencies (without docker)
113+
run: uv sync --frozen
108114
- name: Check typing
109-
if: ${{ matrix.executors == 'multiprocessing' && matrix.python-version == '3.11' }}
115+
if: ${{ matrix.python-version == '3.11' }}
110116
run: ./typecheck.sh
111-
112117
- name: Check formatting
113-
if: ${{ matrix.executors == 'multiprocessing' && matrix.python-version == '3.11' }}
118+
if: ${{ matrix.python-version == '3.11' }}
114119
run: ./format.sh check
115-
116120
- name: Lint code
117-
if: ${{ matrix.executors == 'multiprocessing' && matrix.python-version == '3.11' }}
121+
if: ${{ matrix.python-version == '3.11' }}
118122
run: ./lint.sh
119-
120123
- name: Run multiprocessing tests
121-
if: ${{ matrix.executors == 'multiprocessing' }}
122124
run: |
123125
cd tests
124126
PYTEST_EXECUTORS=multiprocessing,sequential,multiprocessing_with_pickling,sequential_with_pickling \
125127
uv run --frozen python -m pytest -sv test_all.py test_multiprocessing.py
126128
127-
- name: Run slurm tests
128-
if: ${{ matrix.executors == 'slurm' }}
129+
cluster_tools_kubernetes:
130+
needs: changes
131+
if: ${{ needs.changes.outputs.cluster_tools == 'true' }}
132+
runs-on: ubuntu-latest
133+
timeout-minutes: 30
134+
strategy:
135+
matrix:
136+
python-version: ["3.13", "3.12", "3.11", "3.10"]
137+
defaults:
138+
run:
139+
working-directory: cluster_tools
140+
steps:
141+
- uses: actions/checkout@v4
142+
- name: Install uv
143+
uses: astral-sh/setup-uv@v6
144+
with:
145+
version: "0.6.3"
146+
enable-cache: true
147+
cache-dependency-glob: "cluster_tools/uv.lock"
148+
- name: Set up Python ${{ matrix.python-version }}
149+
run: uv python install ${{ matrix.python-version }}
150+
- name: Setup Kubernetes-in-Docker
129151
run: |
130-
cd ./dockered-slurm
131-
docker exec \
132-
-w /cluster_tools/tests \
133-
-e PYTEST_EXECUTORS=slurm \
134-
slurmctld bash -c "uv run --frozen python -m pytest -sv test_all.py test_slurm.py"
135-
docker exec \
136-
-w /cluster_tools/tests \
137-
slurmctld bash -c "uv run --frozen python test_deref_main.py"
152+
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64
153+
chmod +x ./kind
154+
sed -i "s#__PATH__#$(pwd)#g" tests/cluster-config.yaml
155+
./kind create cluster --config=tests/cluster-config.yaml
156+
./kind export kubeconfig
138157
139-
- name: Run kubernetes tests
140-
if: ${{ matrix.executors == 'kubernetes' }}
158+
docker build \
159+
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
160+
-f tests/Dockerfile \
161+
-t scalableminds/cluster-tools:latest \
162+
.
163+
./kind load docker-image scalableminds/cluster-tools:latest
164+
- name: Install dependencies (without docker)
165+
run: uv sync --all-extras --frozen
166+
- name: "Run Kubernetes"
141167
run: |
142168
cd tests
143169
PYTEST_EXECUTORS=kubernetes uv run --frozen python -m pytest -sv test_all.py test_kubernetes.py
144170
145-
- name: Run dask tests
146-
if: ${{ matrix.executors == 'dask' }}
171+
cluster_tools_dask:
172+
needs: changes
173+
if: ${{ needs.changes.outputs.cluster_tools == 'true' }}
174+
runs-on: ubuntu-latest
175+
timeout-minutes: 30
176+
strategy:
177+
matrix:
178+
python-version: ["3.13", "3.12", "3.11", "3.10"]
179+
defaults:
180+
run:
181+
working-directory: cluster_tools
182+
steps:
183+
- uses: actions/checkout@v4
184+
- name: Install uv
185+
uses: astral-sh/setup-uv@v6
186+
with:
187+
version: "0.6.3"
188+
enable-cache: true
189+
cache-dependency-glob: "cluster_tools/uv.lock"
190+
- name: Set up Python ${{ matrix.python-version }}
191+
run: uv python install ${{ matrix.python-version }}
192+
- name: Install dependencies (without docker)
193+
run: uv sync --all-extras --frozen
194+
- name: "Run Dask"
147195
run: |
148196
cd tests
149197
PYTEST_EXECUTORS=dask uv run --frozen python -m pytest -sv test_all.py test_dask.py
@@ -155,9 +203,8 @@ jobs:
155203
${{ needs.changes.outputs.webknossos == 'true' }}
156204
runs-on: ubuntu-latest
157205
strategy:
158-
max-parallel: 4
159206
matrix:
160-
python-version: ["3.12", "3.13", "3.11", "3.10"]
207+
python-version: ["3.13", "3.12", "3.11", "3.10"]
161208
group: [1, 2, 3]
162209
fail-fast: false
163210
defaults:
@@ -177,7 +224,7 @@ jobs:
177224

178225
- name: Install proxay
179226
run: npm install -g proxay
180-
227+
181228
- name: Set up Python ${{ matrix.python-version }}
182229
run: uv python install ${{ matrix.python-version }}
183230

@@ -258,12 +305,17 @@ jobs:
258305
token: ${{ secrets.GITHUB_TOKEN }}
259306
thresholdAll: 0.8
260307
thresholdNew: 0.8
261-
308+
262309
- name: Cleanup temporary files
263310
run: rm -rf ~/coverage-files
264311

265312
webknossos_cli_docker:
266-
needs: [cluster_tools, webknossos_linux]
313+
needs:
314+
- cluster_tools_slurm
315+
- cluster_tools_multiprocessing
316+
- cluster_tools_kubernetes
317+
- cluster_tools_dask
318+
- webknossos_linux
267319
if: |
268320
always() &&
269321
!contains(needs.*.result, 'failure') &&
@@ -335,7 +387,12 @@ jobs:
335387
docker push scalableminds/webknossos-cli:$NORMALIZED_CI_BRANCH
336388
337389
docs:
338-
needs: [cluster_tools, webknossos_linux]
390+
needs:
391+
- cluster_tools_slurm
392+
- cluster_tools_multiprocessing
393+
- cluster_tools_kubernetes
394+
- cluster_tools_dask
395+
- webknossos_linux
339396
runs-on: ubuntu-latest
340397
if: |
341398
always() &&
@@ -391,7 +448,12 @@ jobs:
391448
"$SLACK_HOOK"
392449
393450
pypi_and_gh_release:
394-
needs: [cluster_tools, webknossos_linux]
451+
needs:
452+
- cluster_tools_slurm
453+
- cluster_tools_multiprocessing
454+
- cluster_tools_kubernetes
455+
- cluster_tools_dask
456+
- webknossos_linux
395457
if: |
396458
always() &&
397459
!contains(needs.*.result, 'failure') &&
@@ -429,7 +491,10 @@ jobs:
429491
complete:
430492
needs:
431493
[
432-
cluster_tools,
494+
cluster_tools_dask,
495+
cluster_tools_kubernetes,
496+
cluster_tools_multiprocessing,
497+
cluster_tools_slurm,
433498
webknossos_linux,
434499
webknossos_cli_docker,
435500
docs,

cluster_tools/Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
1414
### Added
1515

1616
### Changed
17+
- Use `sacct` instead of `seff` to detect out-of-memory errors for the Slurm executor. [#1297](https://github.com/scalableminds/webknossos-libs/pull/1297)
1718

1819
### Fixed
1920

0 commit comments

Comments
 (0)