# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA L40S x4) SDK Test

# Triggers:
#   - pull requests targeting `main`
#   - a daily schedule
#   - manual dispatch, where `pr_or_branch` selects what gets checked out
on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *'  # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

# One active run per workflow and PR number; events without a PR number
# (schedule, manual dispatch) fall back to the git ref. A newer run in the
# same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

env:
  # Workflow-level env: exported to every step of every job below.
  TMPDIR: /home/tmp

jobs:
  # Launches a g6e.12xlarge EC2 instance and registers it as a self-hosted
  # runner. The runner label and instance id are exported as job outputs so
  # the test job can target the runner and the stop job can terminate it.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722  # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2  # v2.3.9
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
          ec2-instance-type: g6e.12xlarge
          subnet-id: subnet-024298cefa3bedd61
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  # Runs the end-to-end test on the runner started above, then uploads the
  # per-phase training logs as artifacts for the loss-graphs job.
  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    # Needed by the `gh pr comment` steps below.
    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          repository: "instructlab/training"
          path: "training"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      # NOTE(review): `github.event.inputs` is only populated by
      # workflow_dispatch, so pull_request and schedule runs resolve to 'main'
      # here and the steps below check out `main`, not the PR head. Confirm
      # that this is intended for the pull_request trigger.
      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        env:
          # Handed over via the environment instead of being expanded into the
          # script text, so the user-supplied value is never parsed as shell.
          # Defaults to 'main' if not set.
          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
        run: |
          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y

      - name: test gh CLI
        run: |
          gh --version

      - name: set default repo
        working-directory: ./training
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # In the PR-only steps below pr_or_branch has already matched ^[0-9]+$.
      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        working-directory: ./training
        env:
          # Free-form branch name: keep it out of the script text and quote it.
          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
        run: |
          git checkout "${PR_OR_BRANCH}"

      - name: Update instructlab-training library
        working-directory: ./training
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          # Extras are quoted so the shell cannot treat [cuda] as a glob.
          pip install 'instructlab[cuda]'
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install '.[cuda]'

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        working-directory: ./training
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate

          # NOTE(review): this used to read "set preserve to true so we can
          # retain the logs", but no such option is passed to the script here.
          # Confirm the script keeps its logs by default.
          ./scripts/test_sdh.sh

          # we know that the file will be named something like
          # f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # `mktemp -d` honours TMPDIR, which this workflow sets, so that directory
          # is searched in addition to /tmp.
          search_roots=(/tmp)
          if [[ -n "${TMPDIR:-}" && "${TMPDIR%/}" != "/tmp" ]]; then
            search_roots+=("${TMPDIR}")
          fi

          # Oldest log first, so phase numbers follow the order the logs were
          # written (assumes each phase finishes its log before the next starts).
          # Reading line by line keeps paths intact, and feeding the loop from a
          # process substitution means unreadable directories under the search
          # roots do not abort the step.
          phase_num=1
          while IFS= read -r log_file; do
            mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
            phase_num=$((phase_num + 1))
          done < <(
            find "${search_roots[@]}" -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' 2>/dev/null \
              | sort -n | cut -d' ' -f2-
          )

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./training/phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./training/phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  # Tears the EC2 runner down. `always()` makes this run even when the test
  # job failed or was cancelled, so the instance is not left running.
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722  # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2  # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  # Downloads the two phase logs uploaded by the test job, runs
  # create-loss-graph.py on each, and appends the generated markdown to the
  # job summary. The graph/upload steps are `continue-on-error`, so a failure
  # there only produces a warning.
  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722  # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          repository: "instructlab/training"
          path: "training"
          fetch-depth: 0

      - name: Install dependencies
        working-directory: ./training
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      # NOTE(review): on pull_request events GITHUB_REF has the form
      # refs/pull/<n>/merge, so "${GITHUB_REF##*/}" expands to "merge" rather
      # than a branch name. Confirm what --base-branch expects.
      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python training/scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python training/scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      # `outcome` is the step result before continue-on-error is applied, so
      # these checks see the real success/failure of the upload steps.
      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"