name: E2E (NVIDIA L40S x4) SDK Test

# Triggers: pull requests targeting main, a daily schedule, and manual runs.
on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *'  # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is required but is not referenced by any
      # step visible in this workflow -- confirm whether it should drive the
      # checkout ref or be removed.
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'
# One active run per workflow and per PR number (falling back to the git ref
# when the event carries no PR number); a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true
20+
# Workflow-wide environment. Individual steps may override TMPDIR.
env:
  TMPDIR: /home/tmp
23+
jobs:
  # Launches the EC2 instance (g6e.12xlarge) that hosts the self-hosted runner
  # and publishes its runner label, instance id, and region to later jobs.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          # Overrides the workflow-level TMPDIR (/home/tmp) for this step only.
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # Candidate regions with their subnets, AMI, and security group.
          # NOTE(review): presumably tried in the listed order -- confirm
          # against the launch-ec2-runner-with-fallback action.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  # Runs the SDK end-to-end test on the runner launched above and uploads the
  # per-phase training logs as artifacts.
  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependent PRs if needed
        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): this workflow is triggered by pull_request, schedule and
      # workflow_dispatch only, so this condition is never true and the step
      # is always skipped. Confirm whether it should be removed or re-gated.
      - name: Fetch and checkout PR
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
          git checkout pr-${{ github.event.number }}

      - name: Update instructlab-training library
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          # Extras are quoted so the shell never treats [cuda] as a glob.
          pip install 'instructlab[cuda]'
          pip install vllm
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install '.[cuda]'
          # Reinstall flash-attn from a clean cache, built without isolation.
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ls scripts
          ls ./
          ./scripts/test-sdk.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # `mktemp -d` honours TMPDIR, which this workflow sets to /home/tmp,
          # so search there in addition to /tmp.
          search_roots=("/tmp/")
          if [[ -n "${TMPDIR:-}" && "${TMPDIR%/}" != "/tmp" ]]; then
            search_roots+=("${TMPDIR}")
          fi

          # Sort by modification time (oldest first) so the log that finished
          # earlier is numbered phase 1; plain `find` order is arbitrary.
          log_files=$(
            find "${search_roots[@]}" -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' \
              | sort -n | cut -d' ' -f2-
          )
          phase_num=1
          for log_file in $log_files; do
            mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
            ((phase_num++))
          done

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

  # Always stops the EC2 runner, whether or not the test job succeeded.
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Use the region the launch job reported: the instance may have been
          # started in either configured region, not necessarily AWS_REGION.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  # Downloads the per-phase training logs, renders loss graphs with
  # scripts/create-loss-graph.py, and appends the result to the job summary.
  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      # The two upload steps are best-effort (continue-on-error); the status
      # steps further down report their outcome.
      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
0 commit comments