Skip to content

Commit 1c6040a

Browse files
authored
Merge pull request #2686 from devitocodes/prune-dangling
Remove dangling layers.
2 parents 5f399da + 6e3ad7e commit 1c6040a

File tree

2 files changed

+81
-51
lines changed

2 files changed

+81
-51
lines changed

.github/workflows/docker-devito.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ jobs:
109109
build-args: base=devitocodes/${{ matrix.base }}
110110

111111
- name: Remove dangling layers
112-
if: ${{ !contains(matrix.runner, 'nvidiagpu') }}
113112
run: docker system prune -f
114113

115114
- name: Run tests

.github/workflows/pytest-gpu.yml

Lines changed: 81 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,17 @@ concurrency:
1515
cancel-in-progress: true
1616

1717
env:
18-
OUTPUT_PATH: ${{ github.workspace }}
1918
RESOURCE_GROUP: CI-gpu
2019

2120
on:
2221
push:
23-
branches: [ main ]
22+
branches: [main]
2423
pull_request:
25-
branches: [ main ]
24+
branches: [main]
2625
workflow_dispatch:
2726
inputs:
2827
tags:
29-
description: 'Run GPU tests'
28+
description: "Run GPU tests"
3029

3130
jobs:
3231
build:
@@ -35,66 +34,63 @@ jobs:
3534
- self-hosted
3635
- ${{ matrix.runner_label }}
3736

38-
# Job-level env (includes per-runner image/container tags)
39-
env:
40-
DEVITO_ARCH: ${{ matrix.arch }}
41-
DEVITO_PLATFORM: ${{ matrix.platform }}
42-
DEVITO_LANGUAGE: ${{ matrix.language }}
43-
OMPI_CC: ${{ matrix.arch }}
44-
4537
strategy:
4638
fail-fast: false
4739
matrix:
48-
name: [
49-
pytest-gpu-acc-nvidia,
50-
pytest-gpu-omp-amd
51-
]
40+
name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
5241
test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
5342

5443
include:
55-
# -------------------- NVIDIA job --------------------
56-
- name: pytest-gpu-acc-nvidia
57-
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
58-
base: "devitocodes/bases:nvidia-nvc"
59-
runner_label: nvidiagpu
60-
test_drive_cmd: "nvidia-smi"
61-
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
62-
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
63-
flags: >-
64-
--init --rm -t
65-
--name ${CONTAINER_BASENAME}
66-
--env CUDA_VISIBLE_DEVICES
67-
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
68-
69-
# -------------------- AMD job -----------------------
70-
- name: pytest-gpu-omp-amd
71-
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
72-
runner_label: amdgpu
73-
base: "devitocodes/bases:amd"
74-
test_drive_cmd: "rocm-smi"
75-
# Unchanged, still passes through required /dev nodes etc.
76-
flags: >-
77-
--init --network=host
78-
--device=/dev/kfd --device=/dev/dri
79-
--ipc=host
80-
--group-add video --group-add $(getent group render | cut -d: -f3)
81-
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
82-
--rm -t
83-
--name ${CONTAINER_BASENAME}
44+
# -------------------- NVIDIA job --------------------
45+
- name: pytest-gpu-acc-nvidia
46+
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
47+
base: "devitocodes/bases:nvidia-nvc"
48+
runner_label: nvidiagpu
49+
test_drive_cmd: "nvidia-smi"
50+
# Hard-limit Docker to the GPU(s) listed in CUDA_VISIBLE_DEVICES (all GPUs when it is unset).
51+
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
52+
flags: >-
53+
--init --rm -t
54+
--name ${CONTAINER_BASENAME}
55+
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
56+
57+
# -------------------- AMD job -----------------------
58+
- name: pytest-gpu-omp-amd
59+
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
60+
runner_label: amdgpu
61+
base: "devitocodes/bases:amd"
62+
test_drive_cmd: "rocm-smi"
63+
# Unchanged, still passes through required /dev nodes etc.
64+
flags: >-
65+
--init --network=host
66+
--device=/dev/kfd --device=/dev/dri
67+
--ipc=host
68+
--group-add video --group-add "$(getent group render | cut -d: -f3)"
69+
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
70+
--rm -t
71+
--name ${CONTAINER_BASENAME}
8472
8573
steps:
8674
- name: Checkout devito
8775
uses: actions/checkout@v4
8876

8977
- name: Set per-runner tags
9078
run: |
91-
echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
9279
echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
9380
echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
9481
82+
- name: Ensure buildx builder
83+
run: |
84+
docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \
85+
docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container
86+
docker buildx use "${RUNNER_NAME// /_}"
87+
9588
- name: Build docker image
9689
run: |
97-
docker build . \
90+
docker buildx build . \
91+
--builder "${RUNNER_NAME// /_}" \
92+
--load \
93+
--label ci-run=$GITHUB_RUN_ID \
9894
--rm --pull \
9995
--file docker/Dockerfile.devito \
10096
--tag "${DOCKER_IMAGE}" \
@@ -105,16 +101,37 @@ jobs:
105101

106102
- name: Probe gpu
107103
run: |
108-
# Run a simple driver cmd first (nvidia-smi / rocm-smi)
104+
# Persist a CUDA_VISIBLE_DEVICES fallback of "all" for later steps on NVIDIA
105+
# runners; this step's probe already defaults to "all" via the matrix flags.
106+
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
107+
echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
108+
fi
109+
110+
# Run a simple driver-probe command (nvidia-smi / rocm-smi)
111+
docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true
109112
docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
110113
111114
- name: Test with pytest
115+
env:
116+
# Exported earlier in the job; needed inside the container for codecov
117+
CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
112118
run: |
113-
# Run a simple driver cmd first (nvidia-smi / rocm-smi)
119+
# Add Codecov’s environment variables (GITHUB_SHA, etc.)
114120
ci_env=$(bash <(curl -s https://codecov.io/env))
115121
116-
docker run ${{ matrix.flags }} $ci_env -e CI=true "${DOCKER_IMAGE}" \
117-
pytest --cov --cov-config=.coveragerc --cov-report=xml ${{ matrix.test_files }}
122+
# Run the test suite using the matrix-defined flags
123+
docker run ${{ matrix.flags }} \
124+
${ci_env} \
125+
-e CI=true \
126+
-e PYTHONFAULTHANDLER=1 \
127+
-e DEVITO_LOGGING=DEBUG \
128+
-e CODECOV_TOKEN \
129+
"${DOCKER_IMAGE}" \
130+
pytest -vvv --capture=no --showlocals \
131+
--log-cli-level=DEBUG -o log_cli=true \
132+
--full-trace --durations=10 \
133+
--cov --cov-config=.coveragerc --cov-report=xml \
134+
${{ matrix.test_files }}
118135
119136
- name: Test examples
120137
run: |
@@ -124,3 +141,17 @@ jobs:
124141
run: |
125142
docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
126143
mpiexec -n 2 pytest ${{ matrix.test_examples }}
144+
145+
- name: Builder & image cleanup (keep 3 days of cache)
146+
if: always()
147+
run: |
148+
# Remove only the test image we built
149+
docker rmi -f "${DOCKER_IMAGE}" || true
150+
151+
# Prune dangling images labelled with this run's ID (ci-run=$GITHUB_RUN_ID)
152+
docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID
153+
154+
# BuildKit cache: target the per-runner builder explicitly
155+
docker builder prune --builder "${RUNNER_NAME// /_}" \
156+
-f \
157+
--filter "until=72h"

0 commit comments

Comments
 (0)