Skip to content

Commit 0f1c28f

Browse files
authored
Merge branch 'main' into prolog-epilog-skip
2 parents abcf18e + 530586b commit 0f1c28f

File tree

2 files changed

+25 / -11 lines changed

2 files changed

+25 / -11 lines changed

checks/apps/cp2k/cp2k_uenv.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,11 +15,11 @@
1515

1616
cp2k_references = {
1717
'md': {
18-
'gh200': {'time_run': (69, None, 0.05, 's')},
18+
'gh200': {'time_run': (45, None, 0.05, 's')},
1919
'zen2': {'time_run': (91, None, 0.05, 's')}
2020
},
2121
'pbe': {
22-
'gh200': {'time_run': (67, None, 0.05, 's')},
22+
'gh200': {'time_run': (50, None, 0.05, 's')},
2323
'zen2': {'time_run': (68, None, 0.05, 's')}
2424
},
2525
'rpa': {
Lines changed: 23 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,33 @@
11
#!/bin/bash
22

3-
set -u
3+
set -eu
44

5-
export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
6-
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
7-
export CUDA_VISIBLE_DEVICES=$(( SLURM_LOCALID % 4 ))
5+
export CUDA_DEVICE_MAX_COPY_CONNECTIONS=8
6+
export CUDA_DEVICE_MAX_CONNECTIONS=8
87

9-
if [ "${SLURM_LOCALID}" -eq 0 ]; then
10-
CUDA_VISIBLE_DEVICES=0,1,2,3 nvidia-cuda-mps-control -d
8+
export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps-$((SLURM_LOCALID % 4))
9+
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log-$((SLURM_LOCALID % 4))-$(id -un)
10+
11+
export HWLOC_KEEP_NVIDIA_GPU_NUMA_NODES=0
12+
numa_nodes=$(hwloc-calc --physical --intersect NUMAnode $(hwloc-bind --get --taskset)) # do not set CUDA_VISIBLE_DEVICES, enough to set it for the daemon
13+
14+
# Launch MPS from a single rank per GPU
15+
if [[ $SLURM_LOCALID -lt 4 ]]; then
16+
mkdir -p ${CUDA_MPS_PIPE_DIRECTORY}
17+
mkdir -p ${CUDA_MPS_LOG_DIRECTORY}
18+
CUDA_VISIBLE_DEVICES=$((SLURM_LOCALID % 4)) nvidia-cuda-mps-control -d
1119
fi
1220

13-
sleep 5
21+
# Wait for MPS to start
22+
sleep 1
1423

15-
exec "$@"
24+
# Run the command
25+
"$@"
26+
result=$?
1627

17-
if [ "${SLURM_LOCALID}" -eq 0 ]; then
28+
# Quit MPS control daemon before exiting
29+
if [[ $SLURM_LOCALID -lt 4 ]]; then
1830
echo quit | nvidia-cuda-mps-control
1931
fi
32+
33+
exit $result

0 commit comments

Comments
 (0)