Add regression CI (#206)

mawad-amd · web-flow · commit cdc05dc79147 · 2025-10-09T10:05:07.000-07:00
diff --git a/.github/scripts/run_perf_benchmark.sh b/.github/scripts/run_perf_benchmark.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Run one example benchmark inside the Iris Apptainer image and fail when the
+# reported TFLOPs fall below a threshold.
+#
+# Usage: run_perf_benchmark.sh <example_path> <tflops_threshold> [benchmark args...]
+#   example_path      directory under examples/ whose benchmark.py is run
+#   tflops_threshold  minimum acceptable TFLOPs (integer or decimal)
+#   benchmark args    extra flags appended to the benchmark.py command line
+set -e
+
+# JSON result written by benchmark.py. It appears in ${PWD} because the
+# workspace is bind-mounted and used as the container's working directory.
+RESULT_FILE="perf_result.json"
+
+# Decimal number in plain or exponent notation.
+NUMBER_RE='^-?[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'
+
+# Report a validation failure: emit an error annotation, dump the result file
+# if there is one, close the open log group, and exit non-zero.
+fail() {
+  echo "::error::$1"
+  if [ -f "${RESULT_FILE}" ]; then
+    jq '.' "${RESULT_FILE}" || cat "${RESULT_FILE}"
+  fi
+  echo "::endgroup::"
+  exit 1
+}
+
+# Arguments
+if [ "$#" -lt 2 ]; then
+  echo "::error::Usage: $0 <example_path> <tflops_threshold> [benchmark args...]"
+  exit 1
+fi
+EXAMPLE_PATH=$1
+TFLOPS_THRESHOLD=$2
+shift 2
+# Joined into one string on purpose: it is spliced into the bash -c script below.
+BENCHMARK_ARGS="$*"
+
+# Reject a malformed threshold up front; otherwise the comparison at the end
+# of the script could not be trusted.
+if ! [[ "${TFLOPS_THRESHOLD}" =~ ${NUMBER_RE} ]]; then
+  echo "::error::TFLOPs threshold must be a number, got '${TFLOPS_THRESHOLD}'"
+  exit 1
+fi
+
+# Create overlay image in workspace (will be auto-cleaned by GitHub Actions)
+OVERLAY="iris_overlay_perf_${EXAMPLE_PATH//\//_}.img"
+
+echo "::group::Creating overlay image"
+apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}"
+echo "::endgroup::"
+
+# Drop any result left over from an earlier run so it cannot be mistaken for
+# the output of this one.
+rm -f "${RESULT_FILE}"
+
+echo "::group::Running performance benchmark"
+apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+  --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
+  ~/apptainer/iris-dev.sif bash -c "
+    set -e
+    pip install -e .
+    python examples/${EXAMPLE_PATH}/benchmark.py \
+      --benchmark \
+      --validate \
+      -r 8 \
+      ${BENCHMARK_ARGS} \
+      --output_file ${RESULT_FILE}
+  "
+echo "::endgroup::"
+
+# Parse JSON and check performance
+echo "::group::Validating performance"
+
+if [ ! -s "${RESULT_FILE}" ]; then
+  fail "Benchmark did not produce ${RESULT_FILE}"
+fi
+
+# Check if benchmark succeeded
+SUCCESS=$(jq -r '.success' "${RESULT_FILE}")
+if [ "$SUCCESS" != "true" ]; then
+  fail "Benchmark failed (success: $SUCCESS)"
+fi
+
+TFLOPS=$(jq -r '.tflops' "${RESULT_FILE}")
+if [ -z "$TFLOPS" ] || [ "$TFLOPS" = "null" ]; then
+  fail "Failed to extract tflops from benchmark output"
+fi
+if ! [[ "$TFLOPS" =~ ${NUMBER_RE} ]]; then
+  fail "Benchmark reported a non-numeric tflops value: $TFLOPS"
+fi
+
+echo "::notice::Achieved TFLOPs: $TFLOPS"
+
+# Compare as floating point; bash arithmetic is integer-only. The check is
+# phrased as "pass only if awk confirms TFLOPS >= threshold" so that any awk
+# failure fails the job instead of letting it pass.
+if ! awk -v got="$TFLOPS" -v min="$TFLOPS_THRESHOLD" 'BEGIN { exit !(got + 0 >= min + 0) }'; then
+  fail "Performance regression detected! TFLOPs ($TFLOPS) is below threshold ($TFLOPS_THRESHOLD)"
+fi
+
+echo "✅ Performance test passed! TFLOPs: $TFLOPS (threshold: >=$TFLOPS_THRESHOLD)"
+echo "::endgroup::"
+
diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml
@@ -51,6 +51,8 @@ jobs:
 
       - name: Run External Validation Test with Apptainer
         run: |
+          set -e
+
           # Create unique overlay image for isolation
           OVERLAY="/tmp/iris_overlay_$(whoami)_external_$(date +%s%N).img"
 
@@ -62,6 +64,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
               python test_iris_distributed.py
diff --git a/.github/workflows/iris-performance-regression-test.yml b/.github/workflows/iris-performance-regression-test.yml
@@ -0,0 +1,95 @@
+# Performance regression gate: runs the example benchmarks listed in the
+# matrix below and fails a job when its TFLOPs fall below the threshold.
+name: Iris Performance Regression Test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+# Least privilege: the jobs only need to read the repository contents.
+permissions:
+  contents: read
+
+# Runs are grouped per workflow and PR branch (or ref); a newer run cancels
+# the one in progress unless the ref is main.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+jobs:
+  # Installs Apptainer and builds the container image unless it already exists.
+  build-apptainer-image:
+    runs-on: [self-hosted, mi3008x]
+    timeout-minutes: 20
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Apptainer
+        run: |
+          apt-get update && apt-get install -y software-properties-common
+          add-apt-repository -y ppa:apptainer/ppa
+          apt-get update && apt-get install -y apptainer
+
+      - name: Build Iris Apptainer container
+        run: |
+          # Create persistent Apptainer directory
+          mkdir -p ~/apptainer
+
+          # Build Apptainer image from definition file (only if it doesn't exist)
+          if [ ! -f ~/apptainer/iris-dev.sif ]; then
+            echo "Building new Apptainer image..."
+            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
+          else
+            echo "Using existing Apptainer image"
+          fi
+
+  # One job per matrix entry; each runs a single example benchmark.
+  performance-test:
+    name: ${{ matrix.example_name }}
+    needs: build-apptainer-image
+    runs-on: [self-hosted, mi3008x]
+    timeout-minutes: 30
+    strategy:
+      # Let the remaining benchmarks finish when one of them fails.
+      fail-fast: false
+      matrix:
+        # Performance baselines measured on AMD Instinct MI325X (8 GPUs)
+        # NOTE(review): the runner label above is mi3008x - confirm the
+        # baselines were taken on the hardware these jobs run on.
+        include:
+          - example_name: "GEMM All-Scatter WG Specialization"
+            example_path: "10_gemm_all_scatter_wg_specialization"
+            tflops_threshold: 1600  # Actual: ~2182 TFLOPs
+            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"
+
+          - example_name: "GEMM All-Scatter"
+            example_path: "07_gemm_all_scatter"
+            tflops_threshold: 1000  # Actual: ~1407 TFLOPs
+            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256"
+
+          - example_name: "GEMM All-Scatter Producer-Consumer"
+            example_path: "11_gemm_all_scatter_producer_consumer"
+            tflops_threshold: 1600  # Actual: ~2190 TFLOPs
+            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48"
+
+          - example_name: "GEMM All-Scatter Bulk Synchronous"
+            example_path: "12_gemm_all_scatter_bulk_synchronous"
+            tflops_threshold: 900  # Actual: ~1262 TFLOPs
+            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Run ${{ matrix.example_name }} Benchmark (8 ranks)
+        run: |
+          bash .github/scripts/run_perf_benchmark.sh \
+            "${{ matrix.example_path }}" \
+            "${{ matrix.tflops_threshold }}" \
+            ${{ matrix.benchmark_args }}
+
diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml
@@ -52,6 +52,8 @@ jobs:
 
       - name: Run pip install tests for 1, 2, 4 ranks in parallel
         run: |
+          set -e
+
           # Run tests in parallel with different GPU assignments
           # Note: Each test gets 2+ GPUs even if it only uses some of them.
           # This allows tests like test_empty_device_handling to verify that
@@ -73,6 +75,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 1
             " &
@@ -82,6 +85,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 2
             " &
@@ -91,6 +95,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 4
             " &
@@ -133,6 +138,8 @@ jobs:
 
       - name: Run 8-rank pip install test
         run: |
+          set -e
+
           # Create unique overlay image for isolation
           OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img"
 
@@ -144,6 +151,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 8
             "
diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml
@@ -50,6 +50,8 @@ jobs:
 
       - name: Run 1, 2, 4 rank tests in parallel
         run: |
+          set -e
+
           # Run tests in parallel with different GPU assignments
           # Note: Each test gets 2+ GPUs even if it only uses some of them.
           # This allows tests like test_empty_device_handling to verify that
@@ -71,6 +73,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install -e .
               bash .github/scripts/run_tests.sh 1
             " &
@@ -80,6 +83,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install -e .
               bash .github/scripts/run_tests.sh 2
             " &
@@ -89,6 +93,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install -e .
               bash .github/scripts/run_tests.sh 4
             " &
@@ -129,6 +134,8 @@ jobs:
 
       - name: Run 8-rank test
         run: |
+          set -e
+
           # Create unique overlay image for isolation
           OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img"
 
@@ -140,6 +147,7 @@ jobs:
           apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
             ~/apptainer/iris-dev.sif bash -c "
+              set -e
               pip install -e .
               bash .github/scripts/run_tests.sh 8
             "