name: E2E (NVIDIA L40S x4) SDK Test

# Triggers: pull requests targeting main, a daily schedule, and manual dispatch.
on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *' # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is not referenced by any step visible in this
      # workflow, so a manual run checks out the dispatching ref - confirm
      # whether the checkout step was meant to consume it.
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

# One in-flight run per PR number (or per ref when there is no PR number);
# a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

# Workflow-wide temp directory. The e2e job creates it with `mkdir -p` before
# use; the EC2 launch step overrides it.
env:
  TMPDIR: /home/tmp

jobs:
  # Launches the EC2 instance that hosts the self-hosted runner and exports the
  # runner label, instance id and instance region for the downstream jobs.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          # Overrides the workflow-level TMPDIR (/home/tmp) for this step only;
          # presumably because /home/tmp is never created on this hosted runner.
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # JSON list of candidate regions, each with its subnets, AMI and
          # security group. The instance may end up in any of them, which is why
          # the region is exported as a job output.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  # Runs the SDK end-to-end test on the runner launched above, then uploads the
  # per-phase training logs as artifacts for the loss-graphs job.
  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependent PRs if needed
        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): this workflow is triggered by pull_request, schedule and
      # workflow_dispatch only, so this pull_request_target condition never
      # matches and the step is always skipped. Left unchanged on purpose:
      # enabling it would switch the tested ref from what the Checkout step
      # fetched to the PR head.
      - name: Fetch and checkout PR
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
          git checkout pr-${{ github.event.number }}

      - name: Update instructlab-training library
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install instructlab[cuda]
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install .[cuda]
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ls scripts
          ls ./
          ./scripts/test-sdk.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # `mktemp -d` honours TMPDIR, which this workflow sets to /home/tmp, so
          # search there as well as /tmp; `sort -u` drops duplicates when both
          # roots are the same directory.
          log_files=$(find "${TMPDIR:-/tmp}" /tmp -name "training_params_and_metrics_global0.jsonl" | sort -u)
          phase_num=1;
          for log_file in $log_files; do
            mv "${log_file}" phase-${phase_num}-training-log.jsonl
            ((phase_num++))
          done

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

  # Always runs, even when the test job fails or is cancelled, so that the
  # EC2 instance is not left running.
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Use the region the instance was actually launched in: the launch
          # step can pick either region from regions_config, so a fixed region
          # variable may point at the wrong one and leave the instance running.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  # Downloads the per-phase training logs, renders loss graphs with
  # scripts/create-loss-graph.py and appends the result to the job summary.
  # A failed S3 upload only produces a warning.
  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      # continue-on-error keeps a failed upload from failing the job; the
      # outcome is inspected by the "Check ... status" steps below.
      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"