@@ -4,22 +4,37 @@
 
 set -e
 
+# Configuration - can be overridden via environment variables
+: "${PLUGIN_LIBEXEC_DIR:=/usr/libexec}"
+: "${SLURM_SYSCONFDIR:=/etc/slurm}"
+: "${SLURM_JOB_SPOOL:=/var/spool/slurm-jobs}"
+: "${SLURM_LOG_DIR:=/var/log/slurm}"
+: "${SLURM_PARTITION:=debug}"
+: "${RETRY_TIMES:=30}"
+: "${RETRY_DELAY:=2}"
+: "${JOB_RETRY_DELAY:=1}"
+: "${JOB_MAX_WAIT:=120}"
+: "${JOB_POLL_INTERVAL:=3}"
+
+PLUGIN_SO="${PLUGIN_LIBEXEC_DIR}/slurm-singularity-exec.so"
+PLUGSTACK_CONF="${SLURM_SYSCONFDIR}/plugstack.conf.d/singularity-exec.conf"
+
 echo "=== Slurm Singularity Plugin Runtime Tests ==="
 echo
 
 # Test 1: Verify plugin files are installed
 echo "Test 1: Verifying plugin installation..."
-if [ -f "/usr/libexec/slurm-singularity-exec.so" ]; then
-    echo "✓ Found plugin library: /usr/libexec/slurm-singularity-exec.so"
+if [ -f "$PLUGIN_SO" ]; then
+    echo "✓ Found plugin library: $PLUGIN_SO"
 else
-    echo "✗ ERROR: Plugin library not found at /usr/libexec/slurm-singularity-exec.so"
+    echo "✗ ERROR: Plugin library not found at $PLUGIN_SO"
     exit 1
 fi
 
-if [ -f "/etc/slurm/plugstack.conf.d/singularity-exec.conf" ]; then
-    echo "✓ Found plugin config: /etc/slurm/plugstack.conf.d/singularity-exec.conf"
+if [ -f "$PLUGSTACK_CONF" ]; then
+    echo "✓ Found plugin config: $PLUGSTACK_CONF"
 else
-    echo "✗ ERROR: Plugin config not found at /etc/slurm/plugstack.conf.d/singularity-exec.conf"
+    echo "✗ ERROR: Plugin config not found at $PLUGSTACK_CONF"
     exit 1
 fi
 echo
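# Note on the ':=' lines added above: ':' is the shell no-op builtin, and the
# "${VAR:=default}" expansion assigns the default only when VAR is unset or
# empty, which is what lets every path and tunable be overridden from the
# environment. A minimal standalone illustration (the variable name is
# illustrative only, not part of this change):
unset PARTITION_DEMO
: "${PARTITION_DEMO:=debug}"    # unset, so the default "debug" is assigned
echo "$PARTITION_DEMO"          # -> debug

PARTITION_DEMO=batch
: "${PARTITION_DEMO:=debug}"    # already set, so "batch" is kept
echo "$PARTITION_DEMO"          # -> batch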
@@ -84,7 +99,7 @@
 if [ "$SKIP_CONTAINER_TEST" != "true" ]; then
     echo "Test 5: Creating a test container image..."
     # Use shared directory so container is accessible from both slurmctld and slurmd
-    TEST_CONTAINER="/var/spool/slurm-jobs/test-debian.sif"
+    TEST_CONTAINER="${SLURM_JOB_SPOOL}/test-debian.sif"
     if [ ! -f "$TEST_CONTAINER" ]; then
         # Create a minimal Debian container
         $SINGULARITY_CMD pull "$TEST_CONTAINER" docker://debian:stable-slim
@@ -102,23 +117,23 @@
 
 # Test 6: Wait for Slurm to be ready
 echo "Test 6: Waiting for Slurm cluster to be ready..."
-if ! retry --times=30 --delay=2 -- scontrol ping > /dev/null 2>&1; then
+if ! retry --times="$RETRY_TIMES" --delay="$RETRY_DELAY" -- scontrol ping > /dev/null 2>&1; then
     echo "✗ ERROR: Slurm controller not responding"
     exit 1
 fi
 echo "✓ Slurm controller is responding"
 
 # Wait for node to be ready
-if ! retry --times=30 --delay=2 -- bash -c 'sinfo -h -o "%T" 2>/dev/null | grep -qE "idle|mixed|alloc"'; then
+if ! retry --times="$RETRY_TIMES" --delay="$RETRY_DELAY" -- bash -c 'sinfo -h -o "%T" 2>/dev/null | grep -qE "idle|mixed|alloc"'; then
     echo "✗ ERROR: No compute nodes are ready"
     echo "Showing sinfo output:"
     sinfo
     echo
     echo "Showing last 50 lines of slurmd logs:"
-    tail -50 /var/log/slurm/slurmd.log 2>/dev/null || echo "Could not read slurmd logs"
+    tail -50 "${SLURM_LOG_DIR}/slurmd.log" 2>/dev/null || echo "Could not read slurmd logs"
     echo
     echo "Showing last 50 lines of slurmctld logs:"
-    tail -50 /var/log/slurm/slurmctld.log 2>/dev/null || echo "Could not read slurmctld logs"
+    tail -50 "${SLURM_LOG_DIR}/slurmctld.log" 2>/dev/null || echo "Could not read slurmctld logs"
     exit 1
 fi
 echo "✓ Compute node is ready"
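# Note: 'retry' is assumed here to be a retry(1)-style helper that reruns the
# command up to --times attempts with --delay seconds between failures. If that
# helper were unavailable, a plain-bash sketch of the node-readiness wait could
# look like this (same variables as above):
node_ready=false
for _ in $(seq 1 "$RETRY_TIMES"); do
    if sinfo -h -o "%T" 2>/dev/null | grep -qE "idle|mixed|alloc"; then
        node_ready=true
        break
    fi
    sleep "$RETRY_DELAY"
done
if [ "$node_ready" != "true" ]; then
    echo "✗ ERROR: No compute nodes are ready"
    exit 1
fi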
@@ -140,7 +155,7 @@
 
 # Wait for job to complete
 echo "Waiting for job $TEST_JOB_ID to complete..."
-retry --times=30 --delay=1 -- bash -c "scontrol show job $TEST_JOB_ID 2>/dev/null | grep -qE 'JobState=(COMPLETED|FAILED|CANCELLED)'" > /dev/null 2>&1
+retry --times="$RETRY_TIMES" --delay="$JOB_RETRY_DELAY" -- bash -c "scontrol show job $TEST_JOB_ID 2>/dev/null | grep -qE 'JobState=(COMPLETED|FAILED|CANCELLED)'" > /dev/null 2>&1
 
 JOB_STATE=$(scontrol show job "$TEST_JOB_ID" 2>/dev/null | grep "JobState" | awk '{print $1}' | cut -d= -f2)
 if [ "$JOB_STATE" = "COMPLETED" ]; then
@@ -158,24 +173,24 @@ echo
 if [ "$SKIP_CONTAINER_TEST" != "true" ]; then
     echo "Test 8: Submitting a containerized test job..."
     JOB_SCRIPT=$(mktemp /tmp/test_job.XXXXXX.sh)
-    cat > "$JOB_SCRIPT" << 'JOBEOF'
+    cat > "$JOB_SCRIPT" << JOBEOF
 #!/bin/bash
 #SBATCH --job-name=test-singularity
-#SBATCH --output=/var/spool/slurm-jobs/test_job_%j.out
-#SBATCH --error=/var/spool/slurm-jobs/test_job_%j.err
-#SBATCH --partition=debug
+#SBATCH --output=${SLURM_JOB_SPOOL}/test_job_%j.out
+#SBATCH --error=${SLURM_JOB_SPOOL}/test_job_%j.err
+#SBATCH --partition=${SLURM_PARTITION}
 #SBATCH --time=00:01:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
 
-echo "Job started at: $(date)"
-echo "Running on node: $(hostname)"
-echo "Job ID: $SLURM_JOB_ID"
+echo "Job started at: \$(date)"
+echo "Running on node: \$(hostname)"
+echo "Job ID: \$SLURM_JOB_ID"
 
 # Test command inside container
 cat /etc/os-release | grep -i pretty
 
-echo "Job completed at: $(date)"
+echo "Job completed at: \$(date)"
 JOBEOF
 
     chmod +x "$JOB_SCRIPT"
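# Note: the switch from << 'JOBEOF' to << JOBEOF above is deliberate. With an
# unquoted delimiter the shell expands ${SLURM_JOB_SPOOL} and ${SLURM_PARTITION}
# while writing the job script, so anything that must run at job time rather
# than submit time now needs escaping: \$(date), \$SLURM_JOB_ID. A minimal
# illustration of the two behaviours:
cat << 'QUOTED'      # quoted delimiter: nothing is expanded
now: $(date)
QUOTED
cat << UNQUOTED      # unquoted: $(date) expands now; \$(date) stays literal
now: $(date)
later: \$(date)
UNQUOTED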
@@ -192,39 +207,38 @@ echo
 
 # Test 9: Wait for job to complete
 echo "Test 9: Waiting for job to complete..."
-max_wait=120
 waited=0
 while true; do
     JOB_STATE=$(scontrol show job "$JOB_ID" 2>/dev/null | grep "JobState=" | sed 's/.*JobState=\([^ ]*\).*/\1/')
-
+
     if [ "$JOB_STATE" = "COMPLETED" ]; then
         echo "✓ Job completed successfully"
         break
     elif [ "$JOB_STATE" = "FAILED" ] || [ "$JOB_STATE" = "CANCELLED" ] || [ "$JOB_STATE" = "TIMEOUT" ]; then
         echo "✗ ERROR: Job failed with state: $JOB_STATE"
         scontrol show job "$JOB_ID"
         exit 1
-    elif [ $waited -ge $max_wait ]; then
-        echo "✗ ERROR: Job did not complete within ${max_wait}s"
+    elif [ $waited -ge $JOB_MAX_WAIT ]; then
+        echo "✗ ERROR: Job did not complete within ${JOB_MAX_WAIT}s"
         scontrol show job "$JOB_ID"
         scancel "$JOB_ID"
         exit 1
     fi
-
-    echo "Job state: $JOB_STATE (${waited}s/${max_wait}s)"
-    sleep 3
-    waited=$(( waited + 3 ))
+
+    echo "Job state: $JOB_STATE (${waited}s/${JOB_MAX_WAIT}s)"
+    sleep "$JOB_POLL_INTERVAL"
+    waited=$(( waited + JOB_POLL_INTERVAL ))
 done
 echo
 
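# Note: JobState is parsed twice in this script (grep/awk/cut in Test 7,
# grep/sed in Test 9). A hypothetical helper that could unify both call sites;
# the function name is illustrative, not part of this change:
job_state() {
    # Print the JobState of the given job ID, or nothing if the job is unknown.
    scontrol show job "$1" 2>/dev/null | sed -n 's/.*JobState=\([^ ]*\).*/\1/p'
}
# Usage sketch: JOB_STATE=$(job_state "$JOB_ID")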
 # Test 10: Check job output
 echo "Test 10: Checking job output..."
-JOB_OUTPUT="/var/spool/slurm-jobs/test_job_${JOB_ID}.out"
+JOB_OUTPUT="${SLURM_JOB_SPOOL}/test_job_${JOB_ID}.out"
 if [ -f "$JOB_OUTPUT" ]; then
     echo "Job output:"
     cat "$JOB_OUTPUT"
     echo
-
+
     if grep -q "PRETTY_NAME" "$JOB_OUTPUT"; then
         echo "✓ Job produced expected output (found PRETTY_NAME)"
     else