Skip to content

Commit 1f6ad1c

Browse files
Copilot and mawad-amd authored
Switch CI workflows to linux-mi325-8gpu-ossci-rad runner (#423)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
1 parent 91f92ed commit 1f6ad1c

File tree

5 files changed

+111
-90
lines changed

5 files changed

+111
-90
lines changed

.github/scripts/container_build.sh

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,19 @@ elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
8383
echo "[INFO] Checking Docker images..."
8484
# Use GitHub variable if set, otherwise default to iris-dev
8585
IMAGE_NAME=${DOCKER_IMAGE_NAME:-"iris-dev"}
86-
86+
8787
# Check if the image exists
8888
if docker image inspect "$IMAGE_NAME" &> /dev/null; then
8989
echo "[INFO] Using existing Docker image: $IMAGE_NAME"
9090
else
91-
echo "[WARNING] Docker image $IMAGE_NAME not found"
92-
echo "[INFO] Please build it using: ./build_triton_image.sh"
93-
echo "[INFO] Or pull it if available from registry"
91+
echo "[INFO] Docker image $IMAGE_NAME not found, building..."
92+
DOCKER_DIR="$(dirname "$(realpath "$0")")/../../docker"
93+
if docker build -t "$IMAGE_NAME" "$DOCKER_DIR"; then
94+
echo "[INFO] Built Docker image: $IMAGE_NAME"
95+
else
96+
echo "[ERROR] Docker build failed"
97+
exit 1
98+
fi
9499
fi
95100
fi
96101

.github/scripts/container_exec.sh

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -107,24 +107,11 @@ elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
107107
fi
108108

109109
# Build run command with proper GPU access
110-
# Get video and render group IDs from host
111-
VIDEO_GID=$(getent group video | cut -d: -f3)
112-
RENDER_GID=$(getent group render | cut -d: -f3)
113-
114110
RUN_CMD="docker run --rm --network=host --device=/dev/kfd --device=/dev/dri"
115111
RUN_CMD="$RUN_CMD --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
116112
RUN_CMD="$RUN_CMD -v ${PWD}:/iris_workspace -w /iris_workspace"
117113
RUN_CMD="$RUN_CMD --shm-size=16G --ulimit memlock=-1 --ulimit stack=67108864"
118-
RUN_CMD="$RUN_CMD --user $(id -u):$(id -g)"
119-
120-
# Add video and render groups for GPU access
121-
if [ -n "$VIDEO_GID" ]; then
122-
RUN_CMD="$RUN_CMD --group-add $VIDEO_GID"
123-
fi
124-
if [ -n "$RENDER_GID" ]; then
125-
RUN_CMD="$RUN_CMD --group-add $RENDER_GID"
126-
fi
127-
114+
128115
RUN_CMD="$RUN_CMD -e HOME=/iris_workspace"
129116
RUN_CMD="$RUN_CMD --entrypoint bash"
130117

.github/workflows/iris-external-validation-test.yml

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ env:
1515
DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }}
1616

1717
jobs:
18-
build-container-image:
19-
runs-on: [self-hosted, mi3xx]
18+
external-validation-test:
19+
name: External Validation Test
20+
runs-on: [linux-mi325-8gpu-ossci-rad]
2021
timeout-minutes: 180
2122

2223
steps:
@@ -36,19 +37,8 @@ jobs:
3637
3738
- name: Build Iris container
3839
run: |
39-
# Use the universal container build script
4040
bash .github/scripts/container_build.sh
4141
42-
external-validation-test:
43-
name: External Validation Test
44-
needs: build-container-image
45-
runs-on: [self-hosted, mi3xx]
46-
timeout-minutes: 180
47-
48-
steps:
49-
- name: Checkout repository
50-
uses: actions/checkout@v4
51-
5242
- name: Acquire GPUs
5343
run: |
5444
bash .github/scripts/acquire_gpus.sh 2
@@ -77,13 +67,27 @@ jobs:
7767
7868
external-gluon-validation-test:
7969
name: External Gluon Validation Test
80-
needs: build-container-image
81-
runs-on: [self-hosted, mi3xx]
70+
runs-on: [linux-mi325-8gpu-ossci-rad]
8271

8372
steps:
8473
- name: Checkout repository
8574
uses: actions/checkout@v4
8675

76+
- name: Setup Apptainer (if not available)
77+
run: |
78+
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
79+
echo "Neither Apptainer nor Docker found, installing Apptainer..."
80+
apt-get update && apt-get install -y software-properties-common
81+
add-apt-repository -y ppa:apptainer/ppa
82+
apt-get update && apt-get install -y apptainer
83+
else
84+
echo "Container runtime already available"
85+
fi
86+
87+
- name: Build Iris container
88+
run: |
89+
bash .github/scripts/container_build.sh
90+
8791
- name: Acquire GPUs
8892
run: |
8993
bash .github/scripts/acquire_gpus.sh 2

.github/workflows/iris-performance-regression-test.yml

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,34 +15,9 @@ env:
1515
DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }}
1616

1717
jobs:
18-
build-container-image:
19-
runs-on: [self-hosted, mi3xx]
20-
timeout-minutes: 180
21-
22-
steps:
23-
- name: Checkout repository
24-
uses: actions/checkout@v4
25-
26-
- name: Setup Apptainer (if not available)
27-
run: |
28-
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
29-
echo "Neither Apptainer nor Docker found, installing Apptainer..."
30-
apt-get update && apt-get install -y software-properties-common
31-
add-apt-repository -y ppa:apptainer/ppa
32-
apt-get update && apt-get install -y apptainer
33-
else
34-
echo "Container runtime already available"
35-
fi
36-
37-
- name: Build Iris container
38-
run: |
39-
# Use the universal container build script
40-
bash .github/scripts/container_build.sh
41-
4218
performance-test:
4319
name: ${{ matrix.example_name }}
44-
needs: build-container-image
45-
runs-on: [self-hosted, mi3xx]
20+
runs-on: [linux-mi325-8gpu-ossci-rad]
4621
timeout-minutes: 180
4722
strategy:
4823
fail-fast: false
@@ -74,6 +49,21 @@ jobs:
7449
- name: Checkout repository
7550
uses: actions/checkout@v4
7651

52+
- name: Setup Apptainer (if not available)
53+
run: |
54+
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
55+
echo "Neither Apptainer nor Docker found, installing Apptainer..."
56+
apt-get update && apt-get install -y software-properties-common
57+
add-apt-repository -y ppa:apptainer/ppa
58+
apt-get update && apt-get install -y apptainer
59+
else
60+
echo "Container runtime already available"
61+
fi
62+
63+
- name: Build Iris container
64+
run: |
65+
bash .github/scripts/container_build.sh
66+
7767
- name: Acquire GPUs
7868
run: |
7969
bash .github/scripts/acquire_gpus.sh 8

.github/workflows/iris-tests.yml

Lines changed: 66 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,33 +15,9 @@ env:
1515
DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }}
1616

1717
jobs:
18-
build-container-image:
19-
runs-on: [self-hosted, mi3xx]
20-
timeout-minutes: 180
21-
22-
steps:
23-
- name: Checkout repository
24-
uses: actions/checkout@v4
25-
26-
- name: Setup Apptainer (if not available)
27-
run: |
28-
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
29-
echo "Neither Apptainer nor Docker found, installing Apptainer..."
30-
apt-get update && apt-get install -y software-properties-common
31-
add-apt-repository -y ppa:apptainer/ppa
32-
apt-get update && apt-get install -y apptainer
33-
else
34-
echo "Container runtime already available"
35-
fi
36-
37-
- name: Build Iris container
38-
run: |
39-
bash .github/scripts/container_build.sh
40-
4118
test-git:
4219
name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, git install)
43-
needs: build-container-image
44-
runs-on: [self-hosted, mi3xx]
20+
runs-on: [linux-mi325-8gpu-ossci-rad]
4521
timeout-minutes: 180
4622
strategy:
4723
fail-fast: false
@@ -93,6 +69,21 @@ jobs:
9369
- name: Checkout repository
9470
uses: actions/checkout@v4
9571

72+
- name: Setup Apptainer (if not available)
73+
run: |
74+
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
75+
echo "Neither Apptainer nor Docker found, installing Apptainer..."
76+
apt-get update && apt-get install -y software-properties-common
77+
add-apt-repository -y ppa:apptainer/ppa
78+
apt-get update && apt-get install -y apptainer
79+
else
80+
echo "Container runtime already available"
81+
fi
82+
83+
- name: Build Iris container
84+
run: |
85+
bash .github/scripts/container_build.sh
86+
9687
- name: Acquire GPUs
9788
run: |
9889
bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"
@@ -119,8 +110,8 @@ jobs:
119110
120111
test-editable:
121112
name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, editable install)
122-
needs: [build-container-image, test-git]
123-
runs-on: [self-hosted, mi3xx]
113+
needs: [test-git]
114+
runs-on: [linux-mi325-8gpu-ossci-rad]
124115
timeout-minutes: 180
125116
strategy:
126117
fail-fast: false
@@ -172,6 +163,21 @@ jobs:
172163
- name: Checkout repository
173164
uses: actions/checkout@v4
174165

166+
- name: Setup Apptainer (if not available)
167+
run: |
168+
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
169+
echo "Neither Apptainer nor Docker found, installing Apptainer..."
170+
apt-get update && apt-get install -y software-properties-common
171+
add-apt-repository -y ppa:apptainer/ppa
172+
apt-get update && apt-get install -y apptainer
173+
else
174+
echo "Container runtime already available"
175+
fi
176+
177+
- name: Build Iris container
178+
run: |
179+
bash .github/scripts/container_build.sh
180+
175181
- name: Acquire GPUs
176182
run: |
177183
bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"
@@ -195,8 +201,8 @@ jobs:
195201
196202
test-install:
197203
name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, pip install)
198-
needs: [build-container-image, test-editable]
199-
runs-on: [self-hosted, mi3xx]
204+
needs: [test-editable]
205+
runs-on: [linux-mi325-8gpu-ossci-rad]
200206
strategy:
201207
fail-fast: false
202208
matrix:
@@ -247,6 +253,21 @@ jobs:
247253
- name: Checkout repository
248254
uses: actions/checkout@v4
249255

256+
- name: Setup Apptainer (if not available)
257+
run: |
258+
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
259+
echo "Neither Apptainer nor Docker found, installing Apptainer..."
260+
apt-get update && apt-get install -y software-properties-common
261+
add-apt-repository -y ppa:apptainer/ppa
262+
apt-get update && apt-get install -y apptainer
263+
else
264+
echo "Container runtime already available"
265+
fi
266+
267+
- name: Build Iris container
268+
run: |
269+
bash .github/scripts/container_build.sh
270+
250271
- name: Acquire GPUs
251272
run: |
252273
bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"
@@ -270,8 +291,7 @@ jobs:
270291
271292
test-new-examples:
272293
name: New examples (${{ matrix.num_ranks }} ranks, ${{ matrix.install_method }})
273-
needs: build-container-image
274-
runs-on: [self-hosted, mi3xx]
294+
runs-on: [linux-mi325-8gpu-ossci-rad]
275295
timeout-minutes: 180
276296
permissions:
277297
contents: read
@@ -290,6 +310,21 @@ jobs:
290310
- name: Checkout repository
291311
uses: actions/checkout@v4
292312

313+
- name: Setup Apptainer (if not available)
314+
run: |
315+
if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
316+
echo "Neither Apptainer nor Docker found, installing Apptainer..."
317+
apt-get update && apt-get install -y software-properties-common
318+
add-apt-repository -y ppa:apptainer/ppa
319+
apt-get update && apt-get install -y apptainer
320+
else
321+
echo "Container runtime already available"
322+
fi
323+
324+
- name: Build Iris container
325+
run: |
326+
bash .github/scripts/container_build.sh
327+
293328
- name: Acquire GPUs
294329
run: |
295330
bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"

0 commit comments

Comments
 (0)