Skip to content

Commit 3d8bf12

Browse files
committed
ci: add 3.12 smoke workflow flavor
Yes, this is a lot of duplication, but until we have a way to generate workflows from a template, it is necessary. A matrix does not help here: it would reuse the same single EC2 runner for both the 3.11 and 3.12 runs, and while that works, the runs would execute sequentially, slowing feedback down by almost 2x. Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
1 parent 21260dc commit 3d8bf12

File tree

2 files changed

+204
-1
lines changed

2 files changed

+204
-1
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22

3-
name: "Run smoke tests via Tox::pytest"
3+
name: "Run smoke tests via Tox::pytest (python 3.11)"
44
# These tests will be long running and require accelerated hardware.
55

66
on:

.github/workflows/smoke-py312.yaml

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
name: 'Run smoke tests via Tox::pytest (python 3.12)'
# Smoke tests are long-running and require accelerated hardware.

on:
  # Manual trigger; `branch` selects what to test and defaults to main.
  workflow_dispatch:
    inputs:
      branch:
        type: string
        default: 'main'
  # pull_request_target is used rather than pull_request because this
  # workflow has to run in the context of the base branch (main) and
  # read the repo's secrets in order to start the AWS instances.
  pull_request_target:
    branches:
      - 'main'
      - 'release-*'
    paths:
      # Keep this list in sync with the merging criteria in 'mergify.yml'.
      - '**.py'
      - 'tox.ini'
      - 'pyproject.toml'
      - 'requirements-dev.txt'
      - 'requirements-cuda.txt'

# Workflow-wide default token scope: read-only access to repository contents.
permissions:
  contents: read

# Every `run:` step uses bash unless it overrides the shell itself.
defaults:
  run:
    shell: bash

env:
  ec2_runner_variant: 'g6e.12xlarge'  # 4x L40s
36+
37+
jobs:
38+
start-large-ec2-runner:
  # Launches the EC2 instance that serves as the self-hosted runner for the
  # smoke tests, and exposes its runner label, instance id and region as job
  # outputs for the jobs that depend on this one.
  runs-on: ubuntu-latest
  outputs:
    label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
    ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
    ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
  steps:
    - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/ci-actions
        # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
        path: ci-actions
        ref: release-v0.1
        # Only the one action used below is materialised on disk.
        sparse-checkout: |
          actions/launch-ec2-runner-with-fallback

    - name: Launch EC2 Runner with Fallback
      id: launch-ec2-instance-with-fallback
      uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
      env:
        TMPDIR: "/tmp"
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        # JSON list of candidate regions, each with its per-AZ subnets, AMI
        # and security group.
        # NOTE(review): presumably the action tries these in the listed order
        # as fallbacks - confirm against the ci-actions implementation.
        regions_config: >
          [
            {
              "region": "us-east-2",
              "subnets": {
                "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
            },
            {
              "region": "us-east-1",
              "subnets": {
                "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
            }
          ]
        try_spot_instance_first: false
        # Read the instance type from the workflow-level env so that it is
        # defined in exactly one place instead of being repeated here.
        ec2_instance_type: ${{ env.ec2_runner_variant }}
        # Tags applied to the launched AWS resources for traceability.
        aws_resource_tags: >
          [
            {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]
99+
100+
run-smoke-tests:
  # Runs the py3-smoke tox environment under Python 3.12 on the EC2 runner
  # started by start-large-ec2-runner.
  needs:
    - start-large-ec2-runner
  runs-on: ${{needs.start-large-ec2-runner.outputs.label}}
  # It is important that this job has no write permissions and has
  # no access to any secrets. This part is where we are running
  # untrusted code from PRs.
  permissions: {}
  steps:
    - name: "Harden runner"
      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
      with:
        egress-policy: audit

    - name: "Install packages"
      run: |
        cat /etc/os-release
        sudo dnf install -y gcc gcc-c++ make git-core python3.12 python3.12-devel

    # The exports below are local to this step's shell; they are not carried
    # over to later steps.
    - name: "Verify cuda environment is setup"
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
        export PATH="${PATH}:${CUDA_HOME}/bin"
        nvidia-smi

    - name: "Checkout code"
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        fetch-depth: 0
        # Manual runs test the requested branch. pull_request_target runs have
        # no inputs, and an empty ref makes checkout use the base branch, so
        # fall back to the PR head commit to test the proposed change itself.
        ref: ${{ inputs.branch || github.event.pull_request.head.sha }}

    # installs in $GITHUB_WORKSPACE/venv.
    # only has to install Tox because Tox will do the other virtual environment management.
    - name: "Setup Python virtual environment"
      run: |
        python3.12 -m venv --upgrade-deps venv
        . venv/bin/activate
        pip install tox

    # flash-attn has a bug in the setup.py that causes pip to attempt
    # installing it before torch is installed. This is a bug because their
    # setup.py depends on importing the module, so it should have been listed
    # in build_requires. Alas.
    # See: https://github.com/Dao-AILab/flash-attention/pull/958
    - name: "Install torch and other unlisted build dependencies for flash-attn"
      run: |
        source venv/bin/activate
        # The list is taken from the pull request linked above
        pip install torch packaging setuptools wheel psutil ninja

    - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
      run: |
        source venv/bin/activate
        pip install tox-current-env

    # Export the tox environment's dependency list to a file and install it
    # into the venv without build isolation, so that builds can see the
    # packages installed by the previous steps.
    - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
      run: |
        source venv/bin/activate
        tox -e py3-smoke --print-deps-to-file=./deps.txt
        pip install -r ./deps.txt --no-build-isolation
        pip install .

    # always() keeps the disk report running even if an earlier step failed.
    - name: "Show disk utilization BEFORE tests"
      if: always()
      run: |
        df -h

    - name: "Run smoke tests with Tox and Pytest"
      run: |
        source venv/bin/activate
        tox --current-env -e py3-smoke

    - name: "Show disk utilization AFTER tests"
      if: always()
      run: |
        df -h
177+
178+
stop-large-ec2-runner:
  # Stops the EC2 runner created by start-large-ec2-runner. The always()
  # condition makes this job run whatever the outcome of the jobs it needs.
  if: always()
  needs: [start-large-ec2-runner, run-smoke-tests]
  runs-on: ubuntu-latest
  steps:
    - name: 'Harden runner'
      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.10.1
      with:
        egress-policy: audit

    # Authenticate against the region the instance was launched in, as
    # reported by the start job's outputs.
    - name: 'Configure AWS credentials'
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

    # The runner label and instance id both come from the start job's outputs.
    - name: 'Stop EC2 runner'
      uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        label: ${{ needs.start-large-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

0 commit comments

Comments
 (0)