# End-to-end SDK test workflow; the name advertises a 4x NVIDIA L40S target.
name: E2E (NVIDIA L40S x4) SDK Test

# Three entry points: manual dispatch (given a PR number or a branch name),
# pull requests that target main, and a daily cron run.
on:
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'
  pull_request:
    branches:
      - 'main'
  schedule:
    - cron: '0 16 * * *'  # Runs at 4PM UTC every day

# Runs are grouped by workflow name plus the PR number (or the ref when the
# event carries no PR number); a newer run in a group cancels the older one.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}

# Workflow-wide default. The step that launches the EC2 runner overrides this
# with /tmp for itself.
env:
  TMPDIR: /home/tmp

jobs:
  # Launches the EC2 instance that the e2e job runs on and publishes its
  # runner label, instance id and region as job outputs.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      # Only the launcher action's directory is fetched (sparse checkout).
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          ref: release-v0.1
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - id: launch-ec2-instance-with-fallback
        name: Launch EC2 Runner with Fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          # Overrides the workflow-level TMPDIR (/home/tmp) for this step.
          # NOTE(review): presumably because /home/tmp is never created on the
          # hosted ubuntu runner - confirm.
          TMPDIR: '/tmp'
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2_instance_type: g6e.12xlarge
          try_spot_instance_first: false
          # JSON list of candidate regions, each with its subnets, AMI and
          # security group. NOTE(review): the action name suggests the entries
          # are tried in order as fallbacks - confirm against the action.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          # Tags applied to the AWS resources; GitHubPR is empty when the
          # triggering event has no PR number.
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

# --- job: e2e-large-test (nested under `jobs:`) ---
  # Runs the SDK end-to-end test on the runner started by
  # start-large-ec2-runner, then uploads one training log per phase as
  # artifacts and reports the result on the PR when a PR number was given.
  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit
      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        env:
          # The dispatch input is user-controlled text. It is handed to the
          # script through the environment instead of being expanded into the
          # script body, so it cannot be interpreted as shell code.
          # Defaults to 'main' when the input is not set.
          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
        run: |
          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y

      - name: test gh CLI
        run: |
          gh --version

      - name: set default repo
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # In the PR-only steps below, pr_or_branch has already matched
      # ^[0-9]+$, so expanding it into the command line is safe.
      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # Skipped on pull_request events: there the input is empty, so the
      # value falls back to 'main', and switching to it would replace the PR
      # revision that the Checkout step already fetched with main.
      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false' && github.event_name != 'pull_request'
        env:
          # Arbitrary branch name from the dispatch input: keep it out of the
          # script text and quote it at the point of use.
          TARGET_REF: ${{ steps.check_pr.outputs.pr_or_branch }}
        run: |
          git checkout "${TARGET_REF}"

      - name: Update instructlab-training library
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install instructlab[cuda]
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install .[cuda]
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate

          ./scripts/test-sdk.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # `mktemp -d` honours TMPDIR, which this workflow sets to /home/tmp, so
          # search there first and keep /tmp as a secondary location.
          search_dirs=("${TMPDIR:-/tmp}")
          if [[ "${search_dirs[0]%/}" != "/tmp" ]]; then
            search_dirs+=("/tmp")
          fi

          # Number the logs oldest-first by modification time so the numbering
          # is deterministic instead of following find's traversal order.
          # NOTE(review): assumes phase 1 stops writing before phase 2 - confirm.
          phase_num=1
          while IFS= read -r log_file; do
            mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
            phase_num=$((phase_num + 1))
          done < <(
            find "${search_dirs[@]}" -name "training_params_and_metrics_global0.jsonl" -printf '%T@\t%p\n' 2>/dev/null \
              | sort -n \
              | cut -f2-
          )

          if (( phase_num == 1 )); then
            echo "::warning::No training log files were found under: ${search_dirs[*]}"
          fi

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

# --- job: stop-large-ec2-runner (nested under `jobs:`) ---
  # Stops the EC2 runner. `always()` makes this run even when the e2e job
  # failed or was cancelled, so the instance is not left running.
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # The launcher may place the instance in either configured region
          # (us-east-2 or us-east-1), so use the region it reported. Fall back
          # to the repository default only when no region was reported.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region || vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

# --- job: loss-graphs (nested under `jobs:`) ---
  # Runs once the runner has been stopped. Downloads the two per-phase
  # training logs, runs scripts/create-loss-graph.py on each, and appends the
  # generated markdown (or a failure note) to the job summary.
  loss-graphs:
    if: ${{ always() }}
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    steps:
      - name: 'Harden Runner'
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-region: ${{ vars.AWS_REGION }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      # Artifact names match the ones uploaded by the e2e-large-test job.
      - id: phase-1-download-logs
        name: Download loss data Phase 1
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - id: phase-2-download-logs
        name: Download loss data Phase 2
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      # Best effort: continue-on-error keeps a failure here from failing the
      # job; the outcome is inspected by the status steps further down.
      - id: phase-1-upload-s3
        name: Try to upload Phase 1 to s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - id: phase-2-upload-s3
        name: Try to upload Phase 2 to s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      # On success, the markdown written by the script goes into the summary.
      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      # On failure, emit a warning annotation and a short note in the summary.
      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"