Skip to content

Commit 9b4a4be

Browse files
committed
feat: test_train and test_model unit tests, e2e test
Add test_train.py, which mocks `train` and `train_epoch`. Additionally, add test_model.py, which tests instantiation of all new classes: Model, Accelerator, Optimizer, and Checkpointer. Signed-off-by: Charlie Doern <cdoern@redhat.com>
1 parent a75ff0a commit 9b4a4be

File tree

9 files changed

+2080
-2
lines changed

9 files changed

+2080
-2
lines changed

.github/workflows/e2e-nvidia-l40s-x4-sdk.yml

Lines changed: 374 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,383 @@
33
# Workflow-level settings: display name, triggers, run de-duplication and
# the environment shared by every job.
name: E2E (NVIDIA L40S x4) SDK Test

on:
  # Pull requests that target main.
  pull_request:
    branches:
      - main
  # Runs at 4PM UTC every day.
  schedule:
    - cron: "0 16 * * *"
  # Manual runs pick the pull request or branch to test.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: "pull request number or branch name"
        required: true
        default: "main"

# At most one active run per workflow and pull request (or ref); starting a
# newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

# Inherited by every job and step unless a narrower `env` overrides it.
env:
  TMPDIR: /home/tmp

jobs:
25+
# Launches the self-hosted EC2 runner that the e2e job executes on.
start-large-ec2-runner:
  runs-on: ubuntu-latest
  # Re-exported from the launch step so that dependent jobs can read them via
  # `needs.start-large-ec2-runner.outputs.*`.
  outputs:
    label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
    ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
    ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
  steps:
    - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/ci-actions
        ref: release-v0.1
        # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
        path: ci-actions
        # Only the launch action's directory is needed from that repository.
        sparse-checkout: |
          actions/launch-ec2-runner-with-fallback

    - id: launch-ec2-instance-with-fallback
      name: Launch EC2 Runner with Fallback
      uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
      env:
        # Overrides the workflow-level TMPDIR for this step only.
        # NOTE(review): presumably because /home/tmp does not exist on the
        # hosted runner - confirm.
        TMPDIR: "/tmp"
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        # Candidate regions as a JSON array, each with its subnets, AMI and
        # security group.
        regions_config: >
          [
            {
              "region": "us-east-2",
              "subnets": {
                "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
            },
            {
              "region": "us-east-1",
              "subnets": {
                "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
            }
          ]
        try_spot_instance_first: false
        ec2_instance_type: g6e.12xlarge
        # Tags applied to the AWS resources, as a JSON array of Key/Value pairs.
        aws_resource_tags: >
          [
            {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]
86+
87+
# Runs the end-to-end test on the EC2 runner started by
# `start-large-ec2-runner`, collects the per-phase training logs as artifacts
# and reports the result on the pull request when one is being tested.
e2e-large-test:
  needs:
    - start-large-ec2-runner
  runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

  # Needed so the GITHUB_TOKEN can post comments on the pull request.
  permissions:
    pull-requests: write

  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Install Packages
      run: |
        cat /etc/os-release
        mkdir -p "${TMPDIR}"
        sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

    - name: Checkout instructlab/training
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: "instructlab/training"
        path: "training"
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Determine if pr_or_branch is a PR number
      id: check_pr
      env:
        # `inputs` only exists for workflow_dispatch runs. For pull_request
        # runs fall back to the PR number so that the PR itself is tested
        # (previously these runs silently tested 'main'); scheduled runs have
        # neither and use 'main'.
        # Passed through the environment rather than interpolated into the
        # script so that a crafted input cannot inject shell commands.
        PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.number || 'main' }}
      run: |
        # An all-digit value is treated as a pull request number, anything else as a branch name.
        if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
          echo "is_pr=true" >> "$GITHUB_OUTPUT"
        else
          echo "is_pr=false" >> "$GITHUB_OUTPUT"
        fi
        echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

    - name: Check if gh cli is installed
      id: gh_cli
      run: |
        if command -v gh &> /dev/null ; then
          echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
        else
          echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Install gh CLI
      if: steps.gh_cli.outputs.gh_cli_installed == 'false'
      run: |
        sudo dnf install 'dnf-command(config-manager)' -y
        sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
        sudo dnf install gh --repo gh-cli -y

    - name: test gh CLI
      run: |
        gh --version

    - name: set default repo
      working-directory: ./training
      run: |
        gh repo set-default ${{ github.server_url }}/${{ github.repository }}
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Add comment to PR
      if: steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      run: |
        gh pr comment "${PR_OR_BRANCH}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

    - name: Fetch and checkout PR
      if: steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      run: |
        gh pr checkout "${PR_OR_BRANCH}"
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

    - name: Checkout branch
      if: steps.check_pr.outputs.is_pr == 'false'
      working-directory: ./training
      run: |
        git checkout "${PR_OR_BRANCH}"
      env:
        # The branch name can come from user input, so keep it out of the script text.
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

    - name: Update instructlab-training library
      working-directory: ./training
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
        export PATH="$PATH:$CUDA_HOME/bin"
        nvidia-smi
        python3.11 -m venv --upgrade-deps venv
        . venv/bin/activate
        pip install instructlab
        pip install instructlab[cuda]
        python3.11 -m pip install packaging wheel setuptools-scm
        pip install .
        pip install .[cuda]

    - name: Check disk before tests
      run: |
        df -h

    # TODO: switch to downloading a ds rather than generating one
    # - name: Download SDG Dataset
    #   working-directory: ./training
    #   uses: actions/download-artifact@v4
    #   with:
    #     name: sdg-dataset.jsonl
    #     path: dataset

    - name: Run e2e test
      working-directory: ./training
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
        . venv/bin/activate

        # NOTE(review): this workflow is named "SDK Test" but the script is
        # "test_sdh.sh" - confirm the file name is intended.
        ./scripts/test_sdh.sh

        # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
        # and we know that it will be written into a directory created by `mktemp -d`.
        # `mktemp` creates its directory under $TMPDIR when that is set, and this
        # workflow sets TMPDIR, so search there in addition to /tmp.
        search_dirs=(/tmp/)
        if [[ -n "${TMPDIR:-}" && -d "${TMPDIR}" && "${TMPDIR%/}" != "/tmp" ]]; then
          search_dirs+=("${TMPDIR}")
        fi
        log_files=$(find "${search_dirs[@]}" -name "training_params_and_metrics_global0.jsonl")

        # Rename the logs in the order found so the upload steps below can
        # address them as phase-1, phase-2, ...
        phase_num=1;
        for log_file in $log_files; do
          mv "${log_file}" phase-${phase_num}-training-log.jsonl
          ((phase_num++))
        done

    - name: Check disk after tests
      run: |
        df -h

    - name: Upload training logs Phase 1
      uses: actions/upload-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: ./training/phase-1-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Upload training logs Phase 2
      uses: actions/upload-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: ./training/phase-2-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Add comment to PR if the workflow failed
      if: failure() && steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      run: |
        gh pr comment "${PR_OR_BRANCH}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

    - name: Add comment to PR if the workflow succeeded
      if: success() && steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      run: |
        gh pr comment "${PR_OR_BRANCH}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
258+
259+
# Stops the EC2 runner once the e2e job has finished. `always()` makes this
# job run even when an earlier job failed or was cancelled, so the instance
# is not left running.
stop-large-ec2-runner:
  needs:
    - start-large-ec2-runner
    - e2e-large-test
  runs-on: ubuntu-latest
  if: ${{ always() }}
  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        # The launch job can place the instance in any of its configured
        # regions, so target the region it actually reported. Fall back to the
        # repository default when the launch job produced no output.
        aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region || vars.AWS_REGION }}

    - name: Stop EC2 runner
      uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        label: ${{ needs.start-large-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
286+
287+
# Downloads the per-phase training logs uploaded by the e2e job, runs
# create-loss-graph.py on each, and adds the outcome to the run's step summary.
# An S3 upload failure is reported as a warning and does not fail the job.
loss-graphs:
  needs:
    - stop-large-ec2-runner
  runs-on: ubuntu-latest
  if: ${{ always() }}
  steps:
    - name: Harden Runner
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-region: ${{ vars.AWS_REGION }}
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    # Both phase logs are downloaded into the same directory; the step ids
    # are referenced below to locate that directory.
    - id: phase-1-download-logs
      name: Download loss data Phase 1
      uses: actions/download-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: downloaded-data

    - id: phase-2-download-logs
      name: Download loss data Phase 2
      uses: actions/download-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: downloaded-data

    - name: Checkout instructlab/training
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/training
        path: training
        fetch-depth: 0

    - name: Install dependencies
      working-directory: ./training
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements-dev.txt

    # The two upload steps are best-effort (`continue-on-error`); the
    # "status" steps further down inspect their `outcome` to report the result.
    - id: phase-1-upload-s3
      name: Try to upload Phase 1 to s3
      continue-on-error: true
      run: |
        python training/scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
          --output-file "./phase-1-test.md" \
          --phase "1" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    - id: phase-2-upload-s3
      name: Try to upload Phase 2 to s3
      continue-on-error: true
      run: |
        python training/scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
          --output-file "./phase-2-test.md" \
          --phase "2" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    # On success, append the generated markdown to the step summary.
    - name: Check Phase 1 S3 upload status for success
      if: steps.phase-1-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 1 loss graph to S3."
        cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for success
      if: steps.phase-2-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 2 loss graph to S3."
        cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

    # On failure, emit a warning annotation and note it in the step summary.
    - name: Check Phase 1 S3 upload status for failure
      if: steps.phase-1-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for failure
      if: steps.phase-2-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

0 commit comments

Comments
 (0)