Skip to content

Commit cdc05dc

Browse files
authored
Add regression CI (#206)
1 parent 5dbc2fc commit cdc05dc

File tree

5 files changed

+164
-0
lines changed

5 files changed

+164
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/bin/bash
# Run one Iris example benchmark inside the Apptainer dev image and fail the
# job when the reported throughput is below a TFLOPs threshold.
#
# Usage:
#   run_perf_benchmark.sh <example_path> <tflops_threshold> [benchmark args...]
#
#   example_path      Directory name under examples/ that contains benchmark.py.
#   tflops_threshold  Minimum acceptable TFLOPs (non-negative integer or decimal).
#   benchmark args    Extra arguments appended to the benchmark.py command line.
#
# Must be run from the repository root: the current directory is bind-mounted
# into the container and installed with `pip install -e .`.
#
# Exit status:
#   0  benchmark reported success and tflops >= threshold
#   1  benchmark failed, produced no usable result, or regressed
#   2  bad command-line arguments
set -e

RESULT_FILE="perf_result.json"

# Report a failure as a GitHub Actions error annotation, dump the result file
# (when there is one) for debugging, close the open log group and exit 1.
fail() {
    echo "::error::$1"
    if [ -f "${RESULT_FILE}" ]; then
        # Best effort: a malformed file must not mask the real failure.
        jq '.' "${RESULT_FILE}" || true
    fi
    echo "::endgroup::"
    exit 1
}

# Arguments
if [ "$#" -lt 2 ]; then
    echo "::error::Usage: $0 <example_path> <tflops_threshold> [benchmark args...]"
    exit 2
fi

EXAMPLE_PATH=$1
TFLOPS_THRESHOLD=$2
shift 2
# "$*" joins the remaining arguments with spaces; they are re-split by the
# shell inside the container because the command below is built as a string.
BENCHMARK_ARGS="$*"

if ! [[ "${TFLOPS_THRESHOLD}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "::error::tflops_threshold must be a non-negative number, got '${TFLOPS_THRESHOLD}'"
    exit 2
fi

if [ ! -f "examples/${EXAMPLE_PATH}/benchmark.py" ]; then
    echo "::error::examples/${EXAMPLE_PATH}/benchmark.py not found (run from the repository root)"
    exit 2
fi

# Create the overlay image in the workspace. Slashes in the example path are
# replaced with underscores so the image name is a single file name.
OVERLAY="iris_overlay_perf_${EXAMPLE_PATH//\//_}.img"

# `apptainer overlay create` refuses to overwrite an existing image, so drop
# any leftover from a previous run and remove ours again when the script exits.
rm -f "${OVERLAY}"
trap 'rm -f "${OVERLAY}"' EXIT

# Never evaluate a result file left behind by an earlier run.
rm -f "${RESULT_FILE}"

echo "::group::Creating overlay image"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}"
echo "::endgroup::"

echo "::group::Running performance benchmark"
# NOTE: EXAMPLE_PATH and BENCHMARK_ARGS are expanded by this (host) shell into
# the command string, so they must not contain quotes or shell metacharacters.
apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
    --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
    ~/apptainer/iris-dev.sif bash -c "
        set -e
        pip install -e .
        python examples/${EXAMPLE_PATH}/benchmark.py \
            --benchmark \
            --validate \
            -r 8 \
            ${BENCHMARK_ARGS} \
            --output_file ${RESULT_FILE}
    "
echo "::endgroup::"

# Parse JSON and check performance
echo "::group::Validating performance"

if [ ! -f "${RESULT_FILE}" ]; then
    fail "Benchmark did not produce ${RESULT_FILE}"
fi

# Check if benchmark succeeded
SUCCESS=$(jq -r '.success' "${RESULT_FILE}") || fail "Could not parse ${RESULT_FILE}"
if [ "$SUCCESS" != "true" ]; then
    fail "Benchmark failed (success: $SUCCESS)"
fi

TFLOPS=$(jq -r '.tflops' "${RESULT_FILE}")

if [ -z "$TFLOPS" ] || [ "$TFLOPS" = "null" ]; then
    fail "Failed to extract tflops from benchmark output"
fi

echo "::notice::Achieved TFLOPs: $TFLOPS"

# Compare as floating point. Bash arithmetic is integer-only, and truncating
# the value textually breaks on inputs such as ".5" or "1.2e+03".
if ! awk -v achieved="$TFLOPS" -v threshold="$TFLOPS_THRESHOLD" \
        'BEGIN { exit !((achieved + 0) >= (threshold + 0)) }'; then
    fail "Performance regression detected! TFLOPs ($TFLOPS) is below threshold ($TFLOPS_THRESHOLD)"
fi

echo "✅ Performance test passed! TFLOPs: $TFLOPS (threshold: >=$TFLOPS_THRESHOLD)"
echo "::endgroup::"
63+

.github/workflows/iris-external-validation-test.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ jobs:
5151

5252
- name: Run External Validation Test with Apptainer
5353
run: |
54+
set -e
55+
5456
# Create unique overlay image for isolation
5557
OVERLAY="/tmp/iris_overlay_$(whoami)_external_$(date +%s%N).img"
5658
@@ -62,6 +64,7 @@ jobs:
6264
apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
6365
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
6466
~/apptainer/iris-dev.sif bash -c "
67+
set -e
6568
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
6669
wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
6770
python test_iris_distributed.py
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Performance regression gate for the Iris GEMM examples.
#
# Job 1 makes sure the Apptainer image exists on the self-hosted runner.
# Job 2 runs each example listed in the matrix through
# .github/scripts/run_perf_benchmark.sh, which fails the job when the measured
# TFLOPs drop below that entry's `tflops_threshold`.
name: Iris Performance Regression Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# The jobs only check out and run code, so the token needs read access only.
permissions:
  contents: read

# One run per branch/PR; newer pushes cancel older runs everywhere except main.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  build-apptainer-image:
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 20

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer
        run: |
          apt-get update && apt-get install -y software-properties-common
          add-apt-repository -y ppa:apptainer/ppa
          apt-get update && apt-get install -y apptainer

      - name: Build Iris Apptainer container
        run: |
          # Create persistent Apptainer directory
          mkdir -p ~/apptainer

          # Build Apptainer image from definition file (only if it doesn't exist)
          # NOTE(review): an existing image is reused even when apptainer/iris.def
          # has changed; delete ~/apptainer/iris-dev.sif on the runner to force a rebuild.
          if [ ! -f ~/apptainer/iris-dev.sif ]; then
            echo "Building new Apptainer image..."
            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
          else
            echo "Using existing Apptainer image"
          fi

  performance-test:
    name: ${{ matrix.example_name }}
    needs: build-apptainer-image
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 30
    strategy:
      # Let every example report its own result even when another one regresses.
      fail-fast: false
      matrix:
        # Performance baselines measured on AMD Instinct MI325X (8 GPUs)
        # NOTE(review): the runner label is `mi3008x`; confirm the baselines
        # were taken on the same hardware the jobs run on.
        include:
          - example_name: "GEMM All-Scatter WG Specialization"
            example_path: "10_gemm_all_scatter_wg_specialization"
            tflops_threshold: 1600  # Actual: ~2182 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter"
            example_path: "07_gemm_all_scatter"
            tflops_threshold: 1000  # Actual: ~1407 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter Producer-Consumer"
            example_path: "11_gemm_all_scatter_producer_consumer"
            tflops_threshold: 1600  # Actual: ~2190 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48"

          - example_name: "GEMM All-Scatter Bulk Synchronous"
            example_path: "12_gemm_all_scatter_bulk_synchronous"
            tflops_threshold: 900  # Actual: ~1262 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Run ${{ matrix.example_name }} Benchmark (8 ranks)
        # Matrix values reach the shell through environment variables instead
        # of being spliced into the script text by expression expansion.
        env:
          EXAMPLE_PATH: ${{ matrix.example_path }}
          TFLOPS_THRESHOLD: ${{ matrix.tflops_threshold }}
          BENCHMARK_ARGS: ${{ matrix.benchmark_args }}
        run: |
          # BENCHMARK_ARGS is deliberately unquoted so it splits into separate arguments.
          bash .github/scripts/run_perf_benchmark.sh \
            "${EXAMPLE_PATH}" \
            "${TFLOPS_THRESHOLD}" \
            ${BENCHMARK_ARGS}
82+

.github/workflows/iris-pip-install-test.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ jobs:
5252

5353
- name: Run pip install tests for 1, 2, 4 ranks in parallel
5454
run: |
55+
set -e
56+
5557
# Run tests in parallel with different GPU assignments
5658
# Note: Each test gets 2+ GPUs even if it only uses some of them.
5759
# This allows tests like test_empty_device_handling to verify that
@@ -73,6 +75,7 @@ jobs:
7375
apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
7476
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
7577
~/apptainer/iris-dev.sif bash -c "
78+
set -e
7679
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
7780
bash .github/scripts/run_tests.sh 1
7881
" &
@@ -82,6 +85,7 @@ jobs:
8285
apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
8386
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
8487
~/apptainer/iris-dev.sif bash -c "
88+
set -e
8589
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
8690
bash .github/scripts/run_tests.sh 2
8791
" &
@@ -91,6 +95,7 @@ jobs:
9195
apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
9296
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
9397
~/apptainer/iris-dev.sif bash -c "
98+
set -e
9499
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
95100
bash .github/scripts/run_tests.sh 4
96101
" &
@@ -133,6 +138,8 @@ jobs:
133138

134139
- name: Run 8-rank pip install test
135140
run: |
141+
set -e
142+
136143
# Create unique overlay image for isolation
137144
OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img"
138145
@@ -144,6 +151,7 @@ jobs:
144151
apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
145152
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
146153
~/apptainer/iris-dev.sif bash -c "
154+
set -e
147155
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
148156
bash .github/scripts/run_tests.sh 8
149157
"

.github/workflows/iris-tests-apptainer.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ jobs:
5050

5151
- name: Run 1, 2, 4 rank tests in parallel
5252
run: |
53+
set -e
54+
5355
# Run tests in parallel with different GPU assignments
5456
# Note: Each test gets 2+ GPUs even if it only uses some of them.
5557
# This allows tests like test_empty_device_handling to verify that
@@ -71,6 +73,7 @@ jobs:
7173
apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
7274
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
7375
~/apptainer/iris-dev.sif bash -c "
76+
set -e
7477
pip install -e .
7578
bash .github/scripts/run_tests.sh 1
7679
" &
@@ -80,6 +83,7 @@ jobs:
8083
apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
8184
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
8285
~/apptainer/iris-dev.sif bash -c "
86+
set -e
8387
pip install -e .
8488
bash .github/scripts/run_tests.sh 2
8589
" &
@@ -89,6 +93,7 @@ jobs:
8993
apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
9094
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
9195
~/apptainer/iris-dev.sif bash -c "
96+
set -e
9297
pip install -e .
9398
bash .github/scripts/run_tests.sh 4
9499
" &
@@ -129,6 +134,8 @@ jobs:
129134

130135
- name: Run 8-rank test
131136
run: |
137+
set -e
138+
132139
# Create unique overlay image for isolation
133140
OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img"
134141
@@ -140,6 +147,7 @@ jobs:
140147
apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
141148
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
142149
~/apptainer/iris-dev.sif bash -c "
150+
set -e
143151
pip install -e .
144152
bash .github/scripts/run_tests.sh 8
145153
"

0 commit comments

Comments
 (0)