Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions .github/actions/run-smoke/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Composite action: provisions build tooling and a Python virtual environment
# on the current runner, then runs the "py3-smoke" Tox environment.
#
# The calling job must check out the repository before invoking this action:
# the steps below create `venv/` in the working directory and run
# `pip install .` against it.
name: 'Run smoke tests'
description: 'Runs smoke tests'
inputs:
  python-version:
    required: true
    description: >-
      Python version to use. Must be in the form of "3.xx".
runs:
  using: "composite"
  steps:
    - name: "Harden runner"
      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
      with:
        egress-policy: audit

    # The input is handed to the script through an environment variable
    # instead of being spliced into the script text with `${{ }}`, so the
    # shell only ever sees it as data.
    - name: "Install packages"
      shell: bash
      env:
        PYTHON_VERSION: ${{ inputs.python-version }}
      run: |
        cat /etc/os-release
        sudo dnf install -y gcc gcc-c++ make git-core "python${PYTHON_VERSION}" "python${PYTHON_VERSION}-devel"

    # The exports below only live for the duration of this step; they are
    # not propagated to later steps.
    - name: "Verify cuda environment is setup"
      shell: bash
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
        export PATH="${PATH}:${CUDA_HOME}/bin"
        nvidia-smi

    # installs in $GITHUB_WORKSPACE/venv.
    # only has to install Tox because Tox will do the other virtual environment management.
    - name: "Setup Python virtual environment"
      shell: bash
      env:
        PYTHON_VERSION: ${{ inputs.python-version }}
      run: |
        "python${PYTHON_VERSION}" -m venv --upgrade-deps venv
        . venv/bin/activate
        pip install tox

    # flash-attn has a bug in the setup.py that causes pip to attempt
    # installing it before torch is installed. This is a bug because their
    # setup.py depends on importing the module, so it should have been listed
    # in build_requires. Alas. See:
    # https://github.com/Dao-AILab/flash-attention/pull/958
    - name: "Install torch and other unlisted build dependencies for flash-attn"
      shell: bash
      run: |
        source venv/bin/activate
        # The list is taken from the pull request linked above
        pip install torch packaging setuptools wheel psutil ninja

    - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
      shell: bash
      run: |
        source venv/bin/activate
        pip install tox-current-env

    # Dump the dependency list of the py3-smoke Tox environment to a file and
    # install it into the already-populated venv; --no-build-isolation lets
    # those builds see the packages installed in the earlier steps.
    - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
      shell: bash
      run: |
        source venv/bin/activate
        tox -e py3-smoke --print-deps-to-file=./deps.txt
        pip install -r ./deps.txt --no-build-isolation
        pip install .

    # `if: always()` keeps the disk report even when an earlier step failed.
    - name: "Show disk utilization BEFORE tests"
      shell: bash
      if: always()
      run: |
        df -h

    # --current-env makes Tox run inside the venv prepared above instead of
    # creating its own.
    - name: "Run smoke tests with Tox and Pytest"
      shell: bash
      run: |
        source venv/bin/activate
        tox --current-env -e py3-smoke

    - name: "Show disk utilization AFTER tests"
      shell: bash
      if: always()
      run: |
        df -h
145 changes: 145 additions & 0 deletions .github/workflows/smoke-py312.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0

name: "Run smoke tests via Tox::pytest (python 3.12)"
# These tests will be long running and require accelerated hardware.
#
# Job flow:
#   1. start-large-ec2-runner - launches an EC2 instance registered as a
#      self-hosted runner and exposes its label / instance id / region.
#   2. run-smoke-tests        - runs the smoke test composite action on it.
#   3. stop-large-ec2-runner  - always runs afterwards to stop the instance.

on:
  workflow_dispatch:
    inputs:
      branch:
        type: string
        default: main
  # using this rather than pull_request because this workflow
  # needs to run in the context of the base branch (main) and
  # access the repo's secrets to start the AWS instances.
  pull_request_target:
    branches:
      - main
      - "release-*"
    paths:
      # note this should match the merging criteria in 'mergify.yml'
      - "**.py"
      - "tox.ini"
      - "pyproject.toml"
      - "requirements-dev.txt"
      - "requirements-cuda.txt"

permissions:
  contents: read

defaults:
  run:
    shell: bash

env:
  # Single source of truth for the instance type used by the launch step.
  ec2_runner_variant: "g6e.12xlarge" # 4x L40s

jobs:
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      # Regions are listed in the order us-east-2, then us-east-1.
      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          # Read from the workflow-level env so the instance type is only
          # spelled out once in this file.
          ec2_instance_type: ${{ env.ec2_runner_variant }}
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  run-smoke-tests:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
    # It is important that this job has no write permissions and has
    # no access to any secrets. This part is where we are running
    # untrusted code from PRs.
    permissions: {}
    steps:
      - name: "Checkout code"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          # NOTE(review): `inputs.branch` is only populated for
          # workflow_dispatch runs. For pull_request_target runs it is
          # empty, so the checkout action falls back to its default ref -
          # confirm that this is the revision the smoke tests should cover.
          ref: ${{ inputs.branch }}

      - name: Run smoke tests
        uses: ./.github/actions/run-smoke
        with:
          # Quoted so YAML keeps the version a string; an unquoted value
          # such as 3.10 would be read as the float 3.1.
          python-version: "3.12"

  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - run-smoke-tests
    runs-on: ubuntu-latest
    # Run even when the tests failed or were cancelled so the instance is
    # not left running.
    if: ${{ always() }}
    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
        with:
          egress-policy: audit

      - name: "Configure AWS credentials"
        uses: "aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722" # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Use the region the start job actually launched the instance in.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

      - name: "Stop EC2 runner"
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
68 changes: 5 additions & 63 deletions .github/workflows/smoke.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0

name: "Run smoke tests via Tox::pytest"
name: "Run smoke tests via Tox::pytest (python 3.11)"
# These tests will be long running and require accelerated hardware.

on:
Expand Down Expand Up @@ -106,74 +106,16 @@ jobs:
# untrusted code from PRs.
permissions: {}
steps:
- name: "Harden runner"
uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
with:
egress-policy: audit

- name: "Install packages"
run: |
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel

- name: "Verify cuda environment is setup"
run: |
export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
export PATH="${PATH}:${CUDA_HOME}/bin"
nvidia-smi

- name: "Checkout code"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
ref: ${{inputs.branch}}

# installs in $GITHUB_WORKSPACE/venv.
# only has to install Tox because Tox will do the other virtual environment management.
- name: "Setup Python virtual environment"
run: |
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
pip install tox

# flash-attn has a bug in the setup.py that causes pip to attempt
# installing it before torch is installed. This is a bug because their
# setup.py depends on importing the module, so it should have been listed
# in build_requires. Alas.
# See: https://github.com/Dao-AILab/flash-attention/pull/958
- name: "Install torch and other unlisted build dependencies for flash-attn"
run: |
source venv/bin/activate
# The list is taken from the pull request linked above
pip install torch packaging setuptools wheel psutil ninja

- name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
run: |
source venv/bin/activate
pip install tox-current-env

- name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
run: |
source venv/bin/activate
tox -e py3-smoke --print-deps-to-file=./deps.txt
pip install -r ./deps.txt --no-build-isolation
pip install .

- name: "Show disk utilization BEFORE tests"
if: always()
run: |
df -h

- name: "Run smoke tests with Tox and Pytest"
run: |
source venv/bin/activate
tox --current-env -e py3-smoke

- name: "Show disk utilization AFTER tests"
if: always()
run: |
df -h
- name: Run smoke tests
uses: ./.github/actions/run-smoke
with:
python-version: 3.11

stop-large-ec2-runner:
needs:
Expand Down
Loading