instructlab · RobotSail · May 21, 2025 · May 6, 2025
diff --git a/.github/actions/run-smoke/action.yml b/.github/actions/run-smoke/action.yml
@@ -0,0 +1,81 @@
+name: "Run smoke tests"
+description: "Runs smoke tests"
+inputs:
+  python-version:  # spliced into `python<version>` names by the steps below
+    description: >-
+      Python version to use. Must be in the form of "3.xx".
+    required: true
+runs:
+  using: "composite"  # composite action: every run step below sets its own shell
+  steps:
+    - name: "Harden runner"
+      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
+      with:
+        egress-policy: audit
+
+    - name: "Install packages"
+      shell: bash  # logs /etc/os-release, then installs compilers and Python via dnf
+      run: |
+        cat /etc/os-release
+        sudo dnf install -y gcc gcc-c++ make git-core python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
+
+    - name: "Verify cuda environment is setup"
+      shell: bash  # NOTE(review): exports are not written to $GITHUB_ENV; confirm later steps do not need them
+      run: |
+        export CUDA_HOME="/usr/local/cuda"
+        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
+        export PATH="${PATH}:${CUDA_HOME}/bin"
+        nvidia-smi
+
+    # Creates the virtual environment in $GITHUB_WORKSPACE/venv.
+    # Only Tox is installed here; Tox handles the rest of the virtual environment management.
+    - name: "Setup Python virtual environment"
+      shell: bash  # the venv path is relative to the step's working directory
+      run: |
+        python${{ inputs.python-version }} -m venv --upgrade-deps venv
+        source venv/bin/activate
+        pip install tox
+
+    # flash-attn has a bug in its setup.py that causes pip to attempt
+    # building it before torch is installed. Because setup.py imports torch,
+    # torch should have been listed as a build requirement, but it is not.
+    # Alas. So we pre-install torch and the other build dependencies. See:
+    # https://github.com/Dao-AILab/flash-attention/pull/958
+    - name: "Install torch and other unlisted build dependencies for flash-attn"
+      shell: bash  # every step below re-activates ./venv before using pip or tox
+      run: |
+        source venv/bin/activate
+        # The list is taken from the pull request linked above
+        pip install torch packaging setuptools wheel psutil ninja
+
+    - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
+      shell: bash  # NOTE(review): presumably supplies the --print-deps-to-file / --current-env tox options used below
+      run: |
+        source venv/bin/activate
+        pip install tox-current-env
+
+    - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
+      shell: bash  # tox writes deps.txt; pip installs it without build isolation, then installs the project
+      run: |
+        source venv/bin/activate
+        tox -e py3-smoke --print-deps-to-file=./deps.txt
+        pip install -r ./deps.txt --no-build-isolation
+        pip install .
+
+    - name: "Show disk utilization BEFORE tests"
+      if: ${{ always() }}  # report disk usage even when an earlier step failed
+      shell: bash
+      run: |
+        df -h
+
+    - name: "Run smoke tests with Tox and Pytest"
+      shell: bash  # --current-env makes tox run inside the already-populated ./venv
+      run: |
+        source venv/bin/activate
+        tox --current-env -e py3-smoke
+
+    - name: "Show disk utilization AFTER tests"
+      if: ${{ always() }}  # report disk usage even when the tests failed
+      shell: bash
+      run: |
+        df -h
diff --git a/.github/workflows/smoke-py312.yaml b/.github/workflows/smoke-py312.yaml
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: "Run smoke tests via Tox::pytest (python 3.12)"
+# These tests are long-running and require accelerated hardware.
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:  # ref passed to actions/checkout in the run-smoke-tests job
+        type: string
+        default: main
+  # Use pull_request_target rather than pull_request: this workflow
+  # needs to run in the context of the base branch (main) and
+  # access the repo's secrets to start the AWS instances.
+  pull_request_target:
+    branches:
+      - main
+      - release-*
+    paths:
+      # NOTE: keep this list in sync with the merging criteria in 'mergify.yml'.
+      - "**.py"
+      - "tox.ini"
+      - "pyproject.toml"
+      - "requirements-dev.txt"
+      - "requirements-cuda.txt"
+
+permissions:
+  contents: read  # workflow-wide default; run-smoke-tests drops to no permissions
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  ec2_runner_variant: "g6e.12xlarge"  # 4x L40s
+
+jobs:
+  start-large-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:  # read by run-smoke-tests (label) and stop-large-ec2-runner (all three)
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
+    steps:
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: instructlab/ci-actions
+          # Clone the "ci-actions" repo into a local "ci-actions" directory instead of overwriting the current WORKDIR contents.
+          path: ci-actions
+          ref: release-v0.1  # NOTE(review): not pinned to a commit SHA like the other actions in this workflow; consider pinning
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
+
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback  # the job outputs are read from this step id
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
+        env:
+          TMPDIR: "/tmp"
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: ${{ env.ec2_runner_variant }}  # taken from the workflow-level env instead of a second hard-coded copy
+          aws_resource_tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  run-smoke-tests:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{needs.start-large-ec2-runner.outputs.label}}
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{inputs.branch}}  # NOTE(review): `branch` is a workflow_dispatch input; confirm which ref pull_request_target runs check out
+
+      - name: Run smoke tests
+        uses: ./.github/actions/run-smoke
+        with:
+          python-version: "3.12"  # quoted so YAML keeps it a string, not a float
+
+  stop-large-ec2-runner:
+    if: always()  # run the teardown regardless of how the earlier jobs finished
+    runs-on: ubuntu-latest
+    needs:
+      - start-large-ec2-runner
+      - run-smoke-tests
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - name: "Stop EC2 runner"
+        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
+        with:
+          mode: stop
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: "Run smoke tests via Tox::pytest"
+name: "Run smoke tests via Tox::pytest (python 3.11)"
 # These tests will be long running and require accelerated hardware.
 
 on:
@@ -106,74 +106,16 @@ jobs:
     # untrusted code from PRs.
     permissions: {}
     steps:
-      - name: "Harden runner"
-        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
-        with:
-          egress-policy: audit
-
-      - name: "Install packages"
-        run: |
-          cat /etc/os-release
-          sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
-
-      - name: "Verify cuda environment is setup"
-        run: |
-          export CUDA_HOME="/usr/local/cuda"
-          export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
-          export PATH="${PATH}:${CUDA_HOME}/bin"
-          nvidia-smi
-
       - name: "Checkout code"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
           ref: ${{inputs.branch}}
 
-      # installs in $GITHUB_WORKSPACE/venv.
-      # only has to install Tox because Tox will do the other virtual environment management.
-      - name: "Setup Python virtual environment"
-        run: |
-          python3.11 -m venv --upgrade-deps venv
-          . venv/bin/activate
-          pip install tox
-
-      # flash-attn has a bug in the setup.py that causes pip to attempt
-      # installing it before torch is installed. This is a bug because their
-      # setup.py depends on importing the module, so it should have been listed
-      # in build_requires. Alas.
-      # See: https://github.com/Dao-AILab/flash-attention/pull/958
-      - name: "Install torch and other unlisted build dependencies for flash-attn"
-        run: |
-          source venv/bin/activate
-          # The list is taken from the pull request linked above
-          pip install torch packaging setuptools wheel psutil ninja
-
-      - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
-        run: |
-          source venv/bin/activate
-          pip install tox-current-env
-
-      - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
-        run: |
-          source venv/bin/activate
-          tox -e py3-smoke --print-deps-to-file=./deps.txt
-          pip install -r ./deps.txt --no-build-isolation
-          pip install .
-
-      - name: "Show disk utilization BEFORE tests"
-        if: always()
-        run: |
-          df -h
-
-      - name: "Run smoke tests with Tox and Pytest"
-        run: |
-          source venv/bin/activate
-          tox --current-env -e py3-smoke
-
-      - name: "Show disk utilization AFTER tests"
-        if: always()
-        run: |
-          df -h
+      - name: Run smoke tests
+        uses: ./.github/actions/run-smoke
+        with:
+          python-version: "3.11"  # quoted so YAML keeps it a string, not a float
 
   stop-large-ec2-runner:
     needs: