@@ -15,18 +15,17 @@ concurrency:
1515 cancel-in-progress : true
1616
1717env :  # workflow-level environment variables
18- OUTPUT_PATH : ${{ github.workspace }}
1918 RESOURCE_GROUP : CI-gpu
2019
2120on :  # triggers: pushes and pull requests on main, plus manual dispatch
2221 push :
23- branches : [ main ]
22+ branches : [main]
2423 pull_request :
25- branches : [ main ]
24+ branches : [main]
2625 workflow_dispatch :  # manual run; declares a single `tags` input
2726 inputs :
2827 tags :
29- description : ' Run GPU tests'
28+ description : " Run GPU tests"
3029
3130jobs :
3231 build :
@@ -35,66 +34,63 @@ jobs:
3534 - self-hosted
3635 - ${{ matrix.runner_label }}
3736
38- # Job-level env (includes per-runner image/container tags)
39- env :
40- DEVITO_ARCH : ${{ matrix.arch }}
41- DEVITO_PLATFORM : ${{ matrix.platform }}
42- DEVITO_LANGUAGE : ${{ matrix.language }}
43- OMPI_CC : ${{ matrix.arch }}
44-
4537 strategy :  # one job per matrix name; each include entry adds that job's base image, runner label, probe command and docker flags
4638 fail-fast : false
4739 matrix :
48- name : [
49- pytest-gpu-acc-nvidia,
50- pytest-gpu-omp-amd
51- ]
40+ name : [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
5241 test_examples : ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
5342
5443 include :
55- # -------------------- NVIDIA job --------------------
56- - name : pytest-gpu-acc-nvidia
57- test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
58- base : " devitocodes/bases:nvidia-nvc"
59- runner_label : nvidiagpu
60- test_drive_cmd : " nvidia-smi"
61- # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
62- # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
63- flags : >-
64- --init --rm -t
65- --name ${CONTAINER_BASENAME}
66- --env CUDA_VISIBLE_DEVICES
67- --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
68-
69- # -------------------- AMD job -----------------------
70- - name : pytest-gpu-omp-amd
71- test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
72- runner_label : amdgpu
73- base : " devitocodes/bases:amd"
74- test_drive_cmd : " rocm-smi"
75- # Unchanged, still passes through required /dev nodes etc.
76- flags : >-
77- --init --network=host
78- --device=/dev/kfd --device=/dev/dri
79- --ipc=host
80- --group-add video --group-add $(getent group render | cut -d: -f3)
81- --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
82- --rm -t
83- --name ${CONTAINER_BASENAME}
44+ # -------------------- NVIDIA job --------------------
45+ - name : pytest-gpu-acc-nvidia
46+ test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
47+ base : " devitocodes/bases:nvidia-nvc"
48+ runner_label : nvidiagpu
49+ test_drive_cmd : " nvidia-smi"
50+ # Limit Docker to the GPU(s) named by the host's CUDA_VISIBLE_DEVICES; fall back to all GPUs when it is unset or empty.
51+ # NOTE: CUDA_VISIBLE_DEVICES is expected to be set on the runner (systemd drop-in etc.); these flags do not forward it into the container.
52+ flags : >-
53+ --init --rm -t
54+ --name ${CONTAINER_BASENAME}
55+ --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
56+
57+ # -------------------- AMD job -----------------------
58+ - name : pytest-gpu-omp-amd
59+ test_files : " tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
60+ runner_label : amdgpu
61+ base : " devitocodes/bases:amd"
62+ test_drive_cmd : " rocm-smi"
63+ # Pass through /dev/kfd and /dev/dri and add the video and render groups (render GID looked up on the host with getent).
64+ flags : >-
65+ --init --network=host
66+ --device=/dev/kfd --device=/dev/dri
67+ --ipc=host
68+ --group-add video --group-add "$(getent group render | cut -d: -f3)"
69+ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
70+ --rm -t
71+ --name ${CONTAINER_BASENAME}
8472
8573 steps :
8674 - name : Checkout devito
8775 uses : actions/checkout@v4
8876
8977 - name : Set per-runner tags  # exports DOCKER_IMAGE and CONTAINER_BASENAME via GITHUB_ENV, built from matrix.name and RUNNER_NAME with spaces replaced by underscores
9078 run : |
91- echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
9279 echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
9380 echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
9481
82+ - name : Ensure buildx builder  # reuse the builder named after the runner; create it with the docker-container driver when inspect fails, then select it
83+ run : |
84+ docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
85+ docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
86+ docker buildx use "${RUNNER_NAME// /_}"
87+
9588 - name : Build docker image
9689 run : |
97- docker build . \
90+ docker buildx build . \
91+ --builder "${RUNNER_NAME// /_}" \
92+ --load \
93+ --label ci-run=$GITHUB_RUN_ID \
9894 --rm --pull \
9995 --file docker/Dockerfile.devito \
10096 --tag "${DOCKER_IMAGE}" \
@@ -105,16 +101,37 @@ jobs:
105101
106102 - name : Probe gpu
107103 run : |
108- # Run a simple driver cmd first (nvidia-smi / rocm-smi)
104+ # On NVIDIA runners with an unset/empty CUDA_VISIBLE_DEVICES, persist "all" through GITHUB_ENV.
105+ # NOTE(review): a GITHUB_ENV write only reaches later steps; the probe below still relies on the ":-all" fallback inside matrix.flags.
106+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
107+ echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
108+ fi
109+
110+ # Remove any leftover container with the same name, then run the driver-probe command (nvidia-smi / rocm-smi)
111+ docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
109112 docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
110113
111114 - name : Test with pytest
115+ env :
116+ # NOTE(review): assumes CODECOV_TOKEN is already present in the env context (not visible in this hunk - confirm); "-e CODECOV_TOKEN" below forwards it into the container
117+ CODECOV_TOKEN : ${{ env.CODECOV_TOKEN }}
112118 run : |
113- # Run a simple driver cmd first (nvidia-smi / rocm-smi )
119+ # Capture the output of Codecov's env script (downloaded at run time); presumably a list of docker -e flags for CI variables such as GITHUB_SHA
114120 ci_env=$(bash <(curl -s https://codecov.io/env))
115121
116- docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \
117- pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }}
122+ # Run the test suite with the matrix-defined flags; ci_env is expanded unquoted so it word-splits into separate docker arguments
123+ docker run ${{ matrix.flags }} \
124+ ${ci_env} \
125+ -e CI=true \
126+ -e PYTHONFAULTHANDLER=1 \
127+ -e DEVITO_LOGGING=DEBUG \
128+ -e CODECOV_TOKEN \
129+ "${DOCKER_IMAGE}" \
130+ pytest -vvv --capture=no --showlocals \
131+ --log-cli-level=DEBUG -o log_cli=true \
132+ --full-trace --durations=10 \
133+ --cov --cov-config=.coveragerc --cov-report=xml \
134+ ${{ matrix.test_files }}
118135
119136 - name : Test examples
120137 run : |
@@ -124,3 +141,17 @@ jobs:
124141 run : |
125142 docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
126143 mpiexec -n 2 pytest ${{ matrix.test_examples }}
144+
145+ - name : Builder & image cleanup (keep 3 days of cache)
146+ if : always()
147+ run : |
148+ # Best-effort removal of the test image built by this job
149+ docker rmi -f "${DOCKER_IMAGE}" || true
150+
151+ # Prune images carrying the ci-run label that the build step stamped with this run's ID
152+ docker image prune -f --filter "label=ci-run=${GITHUB_RUN_ID}"
153+
154+ # BuildKit cache: prune the per-runner builder through the buildx CLI (same CLI as the build steps, and the one that defines --builder); drop entries older than 72h
155+ docker buildx prune --builder "${RUNNER_NAME// /_}" \
156+ -f \
157+ --filter "until=72h"
0 commit comments