Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/cluster-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Workflow to test latest climate-aware-task-scheduler release against ephemeral Slurm cluster
name: cluster-tests

# Runs on pushes and pull requests against main that touch this workflow or
# anything under cluster/ (except its README), and on manual dispatch.
on:
  push:
    branches: [ main ]
    paths:
      - '.github/workflows/cluster-tests.yml'
      - 'cluster/**'
      - '!cluster/README.md'
  pull_request:
    branches: [ main ]
    paths:
      - '.github/workflows/cluster-tests.yml'
      - 'cluster/**'
      - '!cluster/README.md'
  workflow_dispatch:

# The job only checks out the repository, so read access is sufficient.
permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # clone.sh fetches slurm-docker-cluster; build.sh copies in the patched
      # Dockerfile, builds the images and starts the containers detached.
      - name: Build slurm container
        run: |
          ./cluster/clone.sh
          ./cluster/build.sh
      - name: Run tests
        run: |
          sleep 30 # wait for cluster to come up
          ./cluster/tests.sh
109 changes: 109 additions & 0 deletions cluster/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Origin: https://github.com/giovtorres/slurm-docker-cluster/tree/c9aa93c080567121c6b28913152a1cd696465985
# Modified Dockerfile to install uv, cats and jq
# Base image for every node of the Slurm test cluster.
FROM rockylinux:8

# Image metadata, carried over from the upstream slurm-docker-cluster project.
LABEL maintainer="Giovanni Torres" \
      org.label-schema.docker.cmd="docker-compose up -d" \
      org.opencontainers.image.description="Slurm Docker cluster on Rocky Linux 8" \
      org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
      org.opencontainers.image.title="slurm-docker-cluster"

# Install OS packages: compilers and build tools, munge, MariaDB, Python 3,
# and assorted utilities. The powertools repository is enabled first via the
# dnf config-manager plugin. Package metadata is cleaned in the same layer so
# the cache does not persist in the image.
# Packages are listed one per line, alphabetically, each followed by " \" so
# that adjacent names can never be joined by the line continuation.
RUN set -ex \
    && yum makecache \
    && yum -y update \
    && yum -y install dnf-plugins-core \
    && yum config-manager --set-enabled powertools \
    && yum -y install \
       bash-completion \
       bzip2 \
       gcc \
       gcc-c++ \
       git \
       gnupg \
       http-parser-devel \
       jq \
       json-c-devel \
       make \
       mariadb-devel \
       mariadb-server \
       munge \
       munge-devel \
       perl \
       psmisc \
       python3 \
       python3-devel \
       python3-pip \
       vim-enhanced \
       wget \
    && yum clean all \
    && rm -rf /var/cache/yum

# Point the `python` alternative at python3.
RUN alternatives --set python /usr/bin/python3

# Install Cython and pytest; --no-cache-dir keeps pip's download cache out of
# the image layer.
RUN pip3 install --no-cache-dir Cython pytest

# install uv
# The installer is downloaded to a file and then executed, instead of being
# piped straight into sh: without pipefail a failed or truncated download in a
# `curl | sh` pipeline would not fail the build step.
RUN curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \
    && sh /tmp/uv-install.sh \
    && rm -f /tmp/uv-install.sh

# The steps below expect uv and the tools it installs under /root/.local/bin.
ENV PATH="/root/.local/bin:${PATH}"

# Install the latest released CATS and copy its `cats` launcher to
# /usr/local/bin.
RUN uv tool install climate-aware-task-scheduler && cp /root/.local/bin/cats /usr/local/bin/

# gosu release to download; override with --build-arg GOSU_VERSION=...
ARG GOSU_VERSION=1.17

# Fetch the gosu binary (amd64 build) together with its detached signature,
# verify the signature in a throw-away GnuPG home, then smoke-test the binary.
RUN set -ex \
    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/${GOSU_VERSION}/gosu-amd64" \
    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/${GOSU_VERSION}/gosu-amd64.asc" \
    && chmod +x /usr/local/bin/gosu \
    && export GNUPGHOME="$(mktemp -d)" \
    && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
    && rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc \
    && gosu nobody true

# Git branch/tag of SchedMD/slurm to build. There is no default, so it has to
# be supplied as a build argument.
ARG SLURM_TAG

# Build and install Slurm from source, install the example configs and shell
# completion, create the slurm user/group (uid/gid 990), pre-create the state,
# spool, run and log locations, and generate a munge key.
# The first command aborts with a clear message when SLURM_TAG is unset or
# empty, instead of letting `git clone -b` fail with a confusing error.
RUN set -x \
    && : "${SLURM_TAG:?SLURM_TAG build arg is required}" \
    && git clone -b "${SLURM_TAG}" --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
    && pushd slurm \
    && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
    && make install \
    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
    && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
    && popd \
    && rm -rf slurm \
    && groupadd -r --gid=990 slurm \
    && useradd -r -g slurm --uid=990 slurm \
    && mkdir /etc/sysconfig/slurm \
        /var/spool/slurmd \
        /var/run/slurmd \
        /var/run/slurmdbd \
        /var/lib/slurmd \
        /var/log/slurm \
        /data \
    && touch /var/lib/slurmd/node_state \
        /var/lib/slurmd/front_end_state \
        /var/lib/slurmd/job_state \
        /var/lib/slurmd/resv_state \
        /var/lib/slurmd/trigger_state \
        /var/lib/slurmd/assoc_mgr_state \
        /var/lib/slurmd/assoc_usage \
        /var/lib/slurmd/qos_usage \
        /var/lib/slurmd/fed_mgr_state \
    && chown -R slurm:slurm /var/*/slurm* \
    && /sbin/create-munge-key

# Cluster configuration files from the build context.
COPY slurm.conf /etc/slurm/slurm.conf
# slurmdbd.conf is installed owned by slurm:slurm with mode 600. Setting owner
# and mode on COPY avoids a separate chown/chmod layer (--chmod needs BuildKit).
COPY --chown=slurm:slurm --chmod=600 slurmdbd.conf /etc/slurm/slurmdbd.conf


COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

# Default argument handed to the entrypoint script.
CMD ["slurmdbd"]
43 changes: 43 additions & 0 deletions cluster/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Cluster tests

This folder contains scripts to setup an ephemeral SLURM cluster to test cats
in a more realistic setting than the current integration tests that use
mocking. The setup builds upon work from upstream
https://github.com/giovtorres/slurm-docker-cluster with a patched
[Dockerfile](Dockerfile) that installs the latest release of CATS and makes it
available in the cluster.

## Setup

Clone this repository (GreenScheduler/cats) and then run

```shell
./cluster/clone.sh
./cluster/build.sh
```
to clone the slurm-docker-cluster repo, patch the Dockerfile to install CATS,
build and start the cluster. Note that this requires `docker` and `docker
compose` to be present. Currently this compiles a specific SLURM version, so
this may take a while on older computers. When developing locally, you should
only need to do this once, unless you update the Dockerfile.

Once the cluster is built and running, then you can run the following to get
access to the control node:

```shell
docker exec -it slurmctld bash
```

For more information about slurm-docker-cluster, consult the upstream
repository.

## Tests

An automated testing script is supplied which shows programmatic interaction
with the slurm cluster. Currently cats schedules a short job, and the slurm
`scontrol` output is checked to see that the job was scheduled correctly. To
run the test:

```shell
./cluster/tests.sh
```
9 changes: 9 additions & 0 deletions cluster/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Builds slurm-docker-cluster with patched Dockerfile that installs cats
#
# Must be run from the directory that contains the slurm-docker-cluster
# checkout. The patched Dockerfile is taken from the directory this script
# lives in, so the working directory does not have to be the repository root.
set -euo pipefail

# Absolute path of the directory holding this script and the patched Dockerfile.
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

if [ ! -d slurm-docker-cluster ]; then
  echo "slurm-docker-cluster not found in $PWD; run clone.sh first" >&2
  exit 1
fi

pushd slurm-docker-cluster
echo :: Patching Dockerfile with version that installs cats
cp "$script_dir/Dockerfile" .
docker compose build
# Start the cluster containers in the background.
docker compose up -d
popd
8 changes: 8 additions & 0 deletions cluster/cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Cleans up resources and shuts down containers, useful for local development of slurm-docker-cluster
#
# Must be run from the directory that contains the slurm-docker-cluster
# checkout.
set -euo pipefail

if [ -d slurm-docker-cluster ]; then
  # The containers are built and started by running `docker compose` inside
  # the checkout, so they have to be brought down from there as well. Running
  # `docker compose down` from the parent directory would fail and, under
  # `set -e`, skip the removal below.
  pushd slurm-docker-cluster
  docker compose down
  popd
  rm -r slurm-docker-cluster
else
  echo "slurm-docker-cluster not found in $PWD; nothing to clean up" >&2
fi
6 changes: 6 additions & 0 deletions cluster/clone.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Clones slurm-docker-cluster into the current directory and checks out the
# pinned commit. Safe to re-run: an existing checkout is reused instead of
# making `git clone` fail on the already-present directory.
set -euo pipefail

repo_url=https://github.com/giovtorres/slurm-docker-cluster
repo_dir=slurm-docker-cluster
pinned_commit=c9aa93c080567121c6b28913152a1cd696465985

if [ ! -d "$repo_dir/.git" ]; then
  git clone "$repo_url" "$repo_dir"
fi

git -C "$repo_dir" checkout "$pinned_commit"
46 changes: 46 additions & 0 deletions cluster/tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Run tests to check if slurm picks up begin time set by CATS
# This relies on a cluster already setup and running, if not run
# ./cluster/build.sh
set -euo pipefail

# Here-strings (<<<) are used instead of `echo ... | grep` throughout: with
# pipefail, a reader that exits early (e.g. grep -q) can make the writer die
# of SIGPIPE and turn a successful match into a failed pipeline.

# Step a) Run cats inside the slurmctld container and extract start time
# The awk filter drops everything before the first "{", so raw_output starts
# at the JSON document and keeps every line that follows it.
raw_output=$(docker exec -i slurmctld \
  cats -d 5 --loc RG1 --scheduler=sbatch --command='ls' --format=json | \
  awk 'BEGIN{found=0} {
    if(!found){
      i=index($0,"{");
      if(i){ print substr($0,i); found=1 }
    } else { print }
  }')

# The job ID is the fourth field of the line starting with "Submitted".
job_id=$(awk '/^Submitted/ {print $4}' <<<"$raw_output")
if [ -z "$job_id" ]; then
  echo "❌ Could not detect a job submission ID in cats output!"
  echo "$raw_output"
  exit 1
fi
echo "Detected job submission ID: $job_id"

raw_json=$(grep -v '^Submitted' <<<"$raw_output")
raw_start=$(jq -r '.carbonIntensityOptimal.start' <<<"$raw_json")
if [ -z "$raw_start" ] || [ "$raw_start" = "null" ]; then
  echo "❌ Could not read .carbonIntensityOptimal.start from cats output!"
  echo "$raw_json"
  exit 1
fi

# Keep the timestamp up to the minute and force the seconds to 00, dropping
# fractional seconds and the UTC offset whether or not a fraction is present.
# Example: 2025-08-28T12:43:30.156434+00:00 -> 2025-08-28T12:43:00
# Example: 2025-08-28T12:43:30+00:00        -> 2025-08-28T12:43:00
scheduled_start=$(sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}).*$/\1:00/' <<<"$raw_start")

echo "Expected scheduled start time: $scheduled_start"

# Step b) Fetch job details
job_output=$(docker exec -i slurmctld scontrol show job "$job_id")

# Check condition 1: job is pending for BeginTime
if ! grep -qF "JobState=PENDING Reason=BeginTime Dependency=(null)" <<<"$job_output"; then
  echo "❌ Job state/Reason is not correct!"
  echo "$job_output"
  exit 1
fi

# Check condition 2: start time matches
if ! grep -qF "StartTime=$scheduled_start" <<<"$job_output"; then
  echo "❌ Start time does not match expected!"
  echo "Expected: StartTime=$scheduled_start"
  echo "Actual output:"
  echo "$job_output"
  exit 1
fi

echo "✅ Job is correctly delayed until $scheduled_start"