Skip to content

Commit f4bf4b2

Browse files
committed
feat: test_train and test_model unit tests, e2e test
Add test_train.py, which mocks `train` and `train_epoch`. Additionally, add test_model.py, which tests instantiation of all new classes: Model, Accelerator, Optimizer, Checkpointer. Signed-off-by: Charlie Doern <cdoern@redhat.com>
1 parent 0c75f67 commit f4bf4b2

File tree

9 files changed

+2067
-1
lines changed

9 files changed

+2067
-1
lines changed
Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# End-to-end SDK test workflow.
#
# Jobs, in dependency order:
#   1. start-large-ec2-runner - starts an EC2 runner and exposes its label
#                               and instance id as job outputs.
#   2. e2e-large-test         - runs on that runner: installs the library,
#                               runs the e2e script, uploads training logs.
#   3. stop-large-ec2-runner  - always runs, so the instance is stopped even
#                               when the test job fails.
#   4. loss-graphs            - downloads the training logs and runs
#                               scripts/create-loss-graph.py for each phase.
name: E2E (NVIDIA L40S x4) SDK Test

on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *'  # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

# One run per PR number (or per ref when there is no PR number); a newer run
# cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

env:
  TMPDIR: /home/tmp

jobs:
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
          ec2-instance-type: g6e.12xlarge
          subnet-id: subnet-024298cefa3bedd61
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "instructlab/training"
          path: "training"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        env:
          # The dispatch input is user-controlled text. Hand it to the script
          # through the environment instead of interpolating it into the
          # script body, so it can never be parsed as shell syntax.
          # NOTE(review): on events other than workflow_dispatch the input is
          # empty, so this falls back to 'main' - confirm that testing 'main'
          # on pull_request/schedule runs is intended.
          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
        run: |
          # An all-digit value is treated as a PR number, anything else as a branch.
          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y

      - name: test gh CLI
        run: |
          gh --version

      - name: set default repo
        working-directory: ./training
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # The PR-comment steps below only run when is_pr == 'true', i.e. when
      # pr_or_branch matched ^[0-9]+$, so the interpolated value is digits only.
      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr checkout "${PR_OR_BRANCH}"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        working-directory: ./training
        env:
          # Branch names are free-form text; pass via env and quote on use.
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
        run: |
          git checkout "${PR_OR_BRANCH}"

      - name: Update instructlab-training library
        working-directory: ./training
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install instructlab[cuda]
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install .[cuda]

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        working-directory: ./training
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate

          # set preserve to true so we can retain the logs
          # NOTE(review): no "preserve" option is passed below - this comment
          # may be stale; confirm whether the script needs a flag.
          # NOTE(review): confirm the script name; the workflow is titled
          # "SDK Test" but the script is called test_sdh.sh.
          ./scripts/test_sdh.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # Given this information, we can use the following command to find the file:
          # NOTE(review): this workflow sets TMPDIR=/home/tmp, so `mktemp -d`
          # may create its directory outside /tmp - confirm the search root.
          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
          phase_num=1;
          # Rename each log found to phase-<n>-training-log.jsonl, in find order.
          for log_file in $log_files; do
            mv "${log_file}" phase-${phase_num}-training-log.jsonl
            ((phase_num++))
          done

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./training/phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./training/phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    # Run regardless of the outcome of the jobs this one depends on.
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "instructlab/training"
          path: "training"
          fetch-depth: 0

      - name: Install dependencies
        working-directory: ./training
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      # The two upload steps use continue-on-error, so a failure here is
      # reported by the "status" steps below instead of failing the job.
      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python training/scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python training/scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

scripts/ibm_legacy_tmpl.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# First Party
4+
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo
5+
6+
# Special tokens used by this chat template. Every token is declared with
# add_to_tokenizer=True.
# NOTE(review): the bos literal is spelled "begginingoftext". It is a runtime
# token string, so it is left untouched here - confirm whether existing
# tokenizers/checkpoints depend on this spelling before correcting it.
SPECIAL_TOKENS = SpecialTokens(
    system=TokenInfo("<|system|>", add_to_tokenizer=True),
    user=TokenInfo("<|user|>", add_to_tokenizer=True),
    assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True),
    eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True),
    pad=TokenInfo("<|pad|>", add_to_tokenizer=True),
    bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True),
)
14+
15+
# Jinja chat template, assembled from adjacent string literals.
#
# For each message it emits, depending on message['role']:
#   pretraining -> '<|pretrain|>' + content + '<|endoftext|>' + '<|/pretrain|>'
#   system      -> '<|system|>' + newline + content + newline
#   user        -> '<|user|>' + newline + content + newline
#   assistant   -> '<|assistant|>' + newline + content + '<|endoftext|>',
#                  followed by a newline unless it is the last message
# After the last message, when add_generation_prompt is set, it appends
# '<|assistant|>' + newline.
#
# The '\n' sequences sit in ordinary (non-raw) Python strings, so the template
# text contains real newline characters inside the Jinja string literals.
CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'pretraining' %}"
    "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}"
    "{% elif message['role'] == 'system' %}"
    "{{'<|system|>'+ '\n' + message['content'] + '\n'}}"
    "{% elif message['role'] == 'user' %}"
    "{{'<|user|>' + '\n' + message['content'] + '\n'}}"
    "{% elif message['role'] == 'assistant' %}"
    "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
    "{% endif %}"
    "{% if loop.last and add_generation_prompt %}"
    "{{ '<|assistant|>' + '\n' }}"
    "{% endif %}"
    "{% endfor %}"
)

0 commit comments

Comments
 (0)