Skip to content

Commit 04c6543

Browse files
committed
feat: test_train and test_model unit tests, e2e test
Add test_train.py, which mocks `train` and `train_epoch`. Additionally, add test_model.py, which tests instantiation of all new classes: Model, Accelerator, Optimizer, Checkpointer.

Signed-off-by: Charlie Doern <cdoern@redhat.com>
1 parent a75ff0a commit 04c6543

File tree

9 files changed

+2030
-2
lines changed

9 files changed

+2030
-2
lines changed

.github/workflows/e2e-nvidia-l40s-x4-sdk.yml

Lines changed: 307 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,316 @@
33
# Display name of the workflow.
name: E2E (NVIDIA L40S x4) SDK Test

# Triggers: pull requests targeting main, a daily schedule, and manual dispatch.
on:
  pull_request:
    branches:
      - main
  schedule:
    # Runs at 4PM UTC every day
    - cron: "0 16 * * *"
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads
      # `inputs.pr_or_branch`; confirm whether it should drive the checkout ref.
      pr_or_branch:
        description: "pull request number or branch name"
        required: true
        default: "main"
12-
jobs:
17+
# Runs are grouped by workflow name plus PR number (or the ref when the event
# carries no PR number); a newer run in the same group cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

# Workflow-wide environment; individual steps may override it.
env:
  TMPDIR: /home/tmp

jobs:
25+
# Job (nested under the top-level `jobs:` mapping).
# Launches the EC2 instance that hosts the self-hosted runner and exports the
# runner label, instance id and instance region reported by the launch step.
start-large-ec2-runner:
  runs-on: ubuntu-latest
  outputs:
    label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
    ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
    ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
  steps:
    - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/ci-actions
        # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
        path: ci-actions
        ref: release-v0.1
        sparse-checkout: |
          actions/launch-ec2-runner-with-fallback

    - name: Launch EC2 Runner with Fallback
      id: launch-ec2-instance-with-fallback
      uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
      env:
        # Overrides the workflow-level TMPDIR for this step only.
        TMPDIR: "/tmp"
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        # JSON list of candidate regions, each with its subnets, AMI and security group.
        regions_config: >
          [
            {
              "region": "us-east-2",
              "subnets": {
                "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
            },
            {
              "region": "us-east-1",
              "subnets": {
                "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
            }
          ]
        try_spot_instance_first: false
        ec2_instance_type: g6e.12xlarge
        # JSON list of tags applied to the AWS resources.
        aws_resource_tags: >
          [
            {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]
86+
87+
# Job (nested under the top-level `jobs:` mapping).
# Runs on the EC2 runner started by `start-large-ec2-runner`: installs the
# library into a venv, runs ./scripts/test-sdk.sh, then renames and uploads the
# per-phase training logs as artifacts.
e2e-large-test:
  needs:
    - start-large-ec2-runner
  runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

  permissions:
    pull-requests: write

  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Install Packages
      run: |
        cat /etc/os-release
        mkdir -p "${TMPDIR}"
        sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Install dependent PRs if needed
      uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
      with:
        token: ${{ secrets.GITHUB_TOKEN }}

    # NOTE(review): the workflow's `on:` block lists pull_request, schedule and
    # workflow_dispatch, so this `pull_request_target` condition never matches
    # as written. Confirm the intended event before changing it.
    - name: Fetch and checkout PR
      if: ${{ github.event_name == 'pull_request_target' }}
      run: |
        git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
        git checkout pr-${{ github.event.number }}

    - name: Update instructlab-training library
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
        export PATH="$PATH:$CUDA_HOME/bin"
        nvidia-smi
        python3.11 -m venv --upgrade-deps venv
        . venv/bin/activate
        pip install instructlab
        # Extras are quoted so the shell never treats `[cuda]` as a glob pattern.
        pip install 'instructlab[cuda]'
        python3.11 -m pip install packaging wheel setuptools-scm
        pip install .
        pip install '.[cuda]'
        python3.11 -m pip uninstall -y flash-attn
        python3.11 -m pip cache purge
        python3.11 -m pip install ninja
        MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

    - name: Check disk before tests
      run: |
        df -h

    # TODO: switch to downloading a ds rather than generating one
    # - name: Download SDG Dataset
    #   working-directory: ./training
    #   uses: actions/download-artifact@v4
    #   with:
    #     name: sdg-dataset.jsonl
    #     path: dataset

    - name: Run e2e test
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
        . venv/bin/activate
        ls scripts
        ls ./
        ./scripts/test-sdk.sh

        # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
        # and we know that it will be written into a directory created by `mktemp -d`.
        # `mktemp -d` honours TMPDIR, which this workflow sets, so search that
        # directory as well as /tmp. TMPDIR is skipped when it is /tmp itself or
        # lives under it, so no file is listed (and moved) twice.
        search_roots=(/tmp/)
        if [[ -n "${TMPDIR:-}" && -d "${TMPDIR}" && "${TMPDIR%/}/" != /tmp/* ]]; then
          search_roots+=("${TMPDIR}")
        fi
        log_files=$(find "${search_roots[@]}" -name "training_params_and_metrics_global0.jsonl")
        phase_num=1;
        for log_file in $log_files; do
          mv "${log_file}" phase-${phase_num}-training-log.jsonl
          ((phase_num++))
        done

    - name: Check disk after tests
      run: |
        df -h

    - name: Upload training logs Phase 1
      uses: actions/upload-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: ./phase-1-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Upload training logs Phase 2
      uses: actions/upload-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: ./phase-2-training-log.jsonl
        retention-days: 1
        overwrite: true
193+
194+
# Job (nested under the top-level `jobs:` mapping).
# Always runs after the test job and stops the EC2 runner launched by
# `start-large-ec2-runner`.
stop-large-ec2-runner:
  needs:
    - start-large-ec2-runner
    - e2e-large-test
  runs-on: ubuntu-latest
  if: ${{ always() }}
  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        # The launch step can place the instance in any region listed in its
        # regions_config, so target the region it reported. Fall back to the
        # repository default when the start job produced no output.
        aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region || vars.AWS_REGION }}

    - name: Stop EC2 runner
      uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        label: ${{ needs.start-large-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
221+
222+
# Job (nested under the top-level `jobs:` mapping).
# After the runner is stopped, downloads the per-phase training logs, runs
# scripts/create-loss-graph.py for each phase, and reports the outcome of each
# run in the step summary. The graph steps use continue-on-error, so a failure
# there does not fail the job.
loss-graphs:
  needs:
    - stop-large-ec2-runner
  runs-on: ubuntu-latest
  if: always()
  steps:
    - name: Harden Runner
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: ${{ vars.AWS_REGION }}

    # --- Fetch the logs uploaded by the test job ---
    - name: Download loss data Phase 1
      id: phase-1-download-logs
      uses: actions/download-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: downloaded-data

    - name: Download loss data Phase 2
      id: phase-2-download-logs
      uses: actions/download-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: downloaded-data

    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements-dev.txt

    # --- Build and upload one loss graph per phase (best effort) ---
    - name: Try to upload Phase 1 to s3
      id: phase-1-upload-s3
      continue-on-error: true
      run: |
        python ./scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
          --output-file "./phase-1-test.md" \
          --phase "1" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    - name: Try to upload Phase 2 to s3
      id: phase-2-upload-s3
      continue-on-error: true
      run: |
        python ./scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
          --output-file "./phase-2-test.md" \
          --phase "2" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    # --- Report the outcome of each phase ---
    - name: Check Phase 1 S3 upload status for success
      if: steps.phase-1-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 1 loss graph to S3."
        cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for success
      if: steps.phase-2-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 2 loss graph to S3."
        cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 1 S3 upload status for failure
      if: steps.phase-1-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for failure
      if: steps.phase-2-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

scripts/ibm_legacy_tmpl.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# First Party
4+
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo
5+
6+
# Special tokens used by this chat template. Every token is created with
# add_to_tokenizer=True.
# NOTE(review): the token strings are runtime values and are reproduced exactly
# as written, including the spelling of the bos token.
SPECIAL_TOKENS = SpecialTokens(
    **{
        role: TokenInfo(text, add_to_tokenizer=True)
        for role, text in (
            ("system", "<|system|>"),
            ("user", "<|user|>"),
            ("assistant", "<|assistant|>"),
            ("eos", "<|endoftext|>"),
            ("pad", "<|pad|>"),
            ("bos", "<|begginingoftext|>"),
        )
    }
)
14+
15+
# Jinja chat template, assembled from one fragment per template statement.
# Each message is rendered according to its role ('pretraining', 'system',
# 'user' or 'assistant'); when `add_generation_prompt` is set, an assistant
# header is appended after the last message.
CHAT_TEMPLATE = "".join(
    [
        "{% for message in messages %}",
        # Pretraining samples are wrapped in <|pretrain|> ... <|/pretrain|>.
        "{% if message['role'] == 'pretraining' %}",
        "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}",
        "{% elif message['role'] == 'system' %}",
        "{{'<|system|>'+ '\n' + message['content'] + '\n'}}",
        "{% elif message['role'] == 'user' %}",
        "{{'<|user|>' + '\n' + message['content'] + '\n'}}",
        # Assistant turns end with <|endoftext|>; a newline follows unless this
        # is the last message.
        "{% elif message['role'] == 'assistant' %}",
        "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}",
        "{% endif %}",
        "{% if loop.last and add_generation_prompt %}",
        "{{ '<|assistant|>' + '\n' }}",
        "{% endif %}",
        "{% endfor %}",
    ]
)

0 commit comments

Comments
 (0)