# Workflow header: display name, triggers, run de-duplication and shared environment.
name: E2E (NVIDIA L40S x4) SDK Test

on:
  # Manual runs must say which pull request number or branch to test.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: "pull request number or branch name"
        required: true
        default: "main"
  schedule:
    # Runs at 4PM UTC every day
    - cron: "0 16 * * *"
  pull_request:
    branches:
      - main

# Runs are grouped per workflow and per PR number (or per ref when the event
# carries no PR number); starting a new run cancels the one already in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

# Workflow-wide default; individual steps may override it.
env:
  TMPDIR: /home/tmp

jobs:
# --- job: start-large-ec2-runner -- launch the EC2 runner used by the test job ---
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    # Re-exported from the launch step so later jobs can read them through
    # `needs.start-large-ec2-runner.outputs.*`.
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          ref: release-v0.1
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          # Only the launcher action's directory is checked out.
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - id: launch-ec2-instance-with-fallback
        name: Launch EC2 Runner with Fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          # Overrides the workflow-level TMPDIR for this step only.
          TMPDIR: '/tmp'
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2_instance_type: g6e.12xlarge
          try_spot_instance_first: false
          # JSON list of candidate regions, each with its subnets, AMI and security group.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          # JSON list of tags passed to the launcher for the AWS resources it creates.
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

# --- job: e2e-large-test -- run the end-to-end test on the launched EC2 runner ---
  e2e-large-test:
    needs:
      - start-large-ec2-runner
    # Runs on the runner label exported by the start job.
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    # Needed so the job can comment on pull requests with GITHUB_TOKEN.
    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "instructlab/training"
          path: "training"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      # Classifies the requested target: an all-digit value is treated as a PR
      # number, anything else as a branch name.
      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        env:
          # The user-supplied input goes through the environment instead of being
          # expanded into the script text, so it cannot inject shell commands.
          # Default to 'main' if not set
          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
        run: |
          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi
          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

      - name: Check if gh cli is installed
        id: gh_cli
        run: |
          if command -v gh &> /dev/null ; then
            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
          else
            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Install gh CLI
        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
        run: |
          sudo dnf install 'dnf-command(config-manager)' -y
          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
          sudo dnf install gh --repo gh-cli -y

      - name: test gh CLI
        run: |
          gh --version

      - name: set default repo
        working-directory: ./training
        run: |
          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Add comment to PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${PR_NUMBER}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr checkout "${PR_NUMBER}"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        working-directory: ./training
        run: |
          git checkout "${BRANCH_NAME}"
        env:
          # Branch names are user input; keep them out of the script text.
          BRANCH_NAME: ${{ steps.check_pr.outputs.pr_or_branch }}

      # Creates a fresh venv and installs instructlab plus the checked-out
      # training library, each with its "cuda" extra.
      - name: Update instructlab-training library
        working-directory: ./training
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          # Extras are quoted so the shell never treats "[cuda]" as a glob pattern.
          pip install "instructlab[cuda]"
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install ".[cuda]"

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        working-directory: ./training
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate

          # set preserve to true so we can retain the logs
          # NOTE(review): the workflow name says "SDK" but the script is "test_sdh.sh" -- confirm the file name.
          ./scripts/test_sdh.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # `mktemp -d` honours TMPDIR, which this workflow sets to /home/tmp, so
          # search there as well as in /tmp.
          search_dirs=(/tmp/)
          if [[ -n "${TMPDIR:-}" && "${TMPDIR%/}" != "/tmp" && -d "${TMPDIR}" ]]; then
            search_dirs+=("${TMPDIR}")
          fi

          # Number the logs oldest-first by modification time rather than in
          # directory-traversal order, which is arbitrary for mktemp names.
          # NOTE(review): assumes phase 1 finishes writing its log before phase 2 starts -- confirm.
          phase_num=1
          while IFS= read -r log_file; do
            mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
            phase_num=$((phase_num + 1))
          done < <(
            find "${search_dirs[@]}" -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' \
              | sort -n \
              | cut -d' ' -f2-
          )

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./training/phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./training/phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Add comment to PR if the workflow failed
        if: failure() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${PR_NUMBER}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ steps.check_pr.outputs.pr_or_branch }}

      - name: Add comment to PR if the workflow succeeded
        if: success() && steps.check_pr.outputs.is_pr == 'true'
        working-directory: ./training
        run: |
          gh pr comment "${PR_NUMBER}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ steps.check_pr.outputs.pr_or_branch }}

# --- job: stop-large-ec2-runner -- stop the EC2 runner launched by the start job ---
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    # Run regardless of how the needed jobs finished, so the instance is not left running.
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # The start job is configured with more than one candidate region and
          # reports the one it actually used; target that region so the stop call
          # reaches the instance. Falls back to the repository default when the
          # start job produced no region output.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region || vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

# --- job: loss-graphs -- build loss graphs from the uploaded training logs ---
  loss-graphs:
    # Evaluated even when earlier jobs did not succeed.
    if: ${{ always() }}
    runs-on: ubuntu-latest
    needs:
      - stop-large-ec2-runner
    steps:
      - name: 'Harden Runner'
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-region: ${{ vars.AWS_REGION }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      # Both phase logs are downloaded into the same directory; the step ids are
      # referenced below to locate the files.
      - id: phase-1-download-logs
        name: Download loss data Phase 1
        uses: actions/download-artifact@v4
        with:
          path: downloaded-data
          name: phase-1-training-log.jsonl

      - id: phase-2-download-logs
        name: Download loss data Phase 2
        uses: actions/download-artifact@v4
        with:
          path: downloaded-data
          name: phase-2-training-log.jsonl

      - name: Checkout instructlab/training
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          path: 'training'
          repository: 'instructlab/training'

      - name: Install dependencies
        working-directory: ./training
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      # The two upload steps are best-effort: continue-on-error keeps a failure
      # from failing the job, and the later "Check ..." steps report the outcome.
      - id: phase-1-upload-s3
        name: Try to upload Phase 1 to s3
        continue-on-error: true
        run: |
          python training/scripts/create-loss-graph.py  \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - id: phase-2-upload-s3
        name: Try to upload Phase 2 to s3
        continue-on-error: true
        run: |
          python training/scripts/create-loss-graph.py  \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      # On success, the generated markdown is appended to the run's step summary.
      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      # On failure, emit a warning annotation and a short note in the step summary.
      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
# (web page residue, not part of the workflow) 0 commit comments