Skip to content

Commit bb8d66f

Browse files
Add XL e2e nightly CI job
Add a new XL e2e nightly CI job that triggers every day at 6am UTC. Also update the existing large CI job so that uploaded files are not overwritten. Signed-off-by: Courtney Pacheco <[email protected]>
1 parent 8969740 commit bb8d66f

File tree

3 files changed

+534
-14
lines changed

3 files changed

+534
-14
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: 'Free Disk Space'
2+
description: 'Frees disk space on the runner'
3+
runs:
4+
using: "composite"
5+
steps:
6+
- name: Print disk space before cleanup
7+
run: |
8+
df -h
9+
shell: bash
10+
- name: Free Disk Space Linux
11+
if: runner.os == 'Linux'
12+
run: |
13+
# Determine if we have Ubuntu, CentOS, or other distro as our runner OS
14+
# os-release values may be quoted (e.g. ID="centos"); strip the quotes so the
# distro comparisons below match.
os_id=$(grep '^ID=' /etc/os-release | cut -d "=" -f2 | tr -d '"')
15+
echo "Detected OS distro as: ${os_id}"
16+
17+
# Sometimes `docker` is not installed, so only remove images if we need to.
18+
if command -v docker >/dev/null 2>&1 ; then
19+
# Pass -f as its own flag and let the image IDs split into separate arguments;
# quoting them together made docker see one bogus image reference.
# shellcheck disable=SC2046
sudo docker rmi -f $(sudo docker image ls -aq) >/dev/null 2>&1 || true
20+
fi
21+
22+
# Remove Android, .NET, Haskell, PowerShell, Swift, and JVM toolchains
23+
sudo rm -rf \
24+
/usr/local/lib/android \
25+
/usr/share/dotnet \
26+
/opt/ghc \
27+
/usr/local/.ghcup \
28+
/usr/local/share/powershell \
29+
/usr/share/swift \
30+
/usr/lib/jvm || true
31+
32+
printWarningMessage () {
33+
echo "[warning] Failed to remove '$1', perhaps because it doesn't exist. Ignoring..."
34+
}
35+
36+
# Remove large packages we don't use.
37+
echo "Attempting to remove unused ${os_id} packages..."
38+
if [[ "${os_id}" == "ubuntu" ]]; then
39+
sudo apt-get remove -y '^mysql-.*' || printWarningMessage '^mysql-.*'
40+
sudo apt-get remove -y '^dotnet-.*' --fix-missing || printWarningMessage '^dotnet-.*'
41+
sudo apt-get remove -y 'php.*' --fix-missing || printWarningMessage 'php.*'
42+
sudo apt-get remove -y '^mongodb-.*' --fix-missing || printWarningMessage '^mongodb-.*'
43+
sudo apt-get remove -y '^llvm-.*' --fix-missing || printWarningMessage '^llvm-.*'
44+
sudo apt-get remove -y google-cloud-sdk --fix-missing || printWarningMessage 'google-cloud-sdk'
45+
sudo apt-get remove -y google-cloud-cli --fix-missing || printWarningMessage 'google-cloud-cli'
46+
sudo apt-get autoremove -y >/dev/null 2>&1
47+
sudo apt-get autoclean -y >/dev/null 2>&1
48+
elif [[ "${os_id}" == "centos" ]]; then
49+
sudo dnf -y remove 'mysql-*' || printWarningMessage 'mysql-*'
50+
sudo dnf -y remove 'dotnet-*' || printWarningMessage 'dotnet-*'
51+
sudo dnf -y remove 'php-*' || printWarningMessage 'php-*'
52+
sudo dnf -y remove 'mongodb-*' || printWarningMessage 'mongodb-*'
53+
sudo dnf -y remove 'llvm-*' || printWarningMessage 'llvm-*'
54+
sudo dnf -y remove google-cloud-sdk || printWarningMessage 'google-cloud-sdk'
55+
sudo dnf -y remove google-cloud-cli || printWarningMessage 'google-cloud-cli'
56+
sudo dnf clean all
57+
# The dnf cache is typically root-owned; use sudo like the other cleanup commands.
sudo rm -rf /var/cache/dnf*
58+
else
59+
echo "Unrecognized OS '${os_id}'. Skipping large package cleanup, as this logic has not been implemented for ${os_id}."
60+
fi
61+
shell: bash
62+
- name: Free Disk Space MacOS
63+
if: runner.os == 'macOS'
64+
run: |
65+
sudo rm -rf /System/Volumes/Data/Applications/Xcode_15*
66+
shell: bash
67+
- name: Print disk space after cleanup
68+
run: |
69+
df -h
70+
shell: bash

.github/workflows/e2e-nvidia-l40s-x4.yml

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,20 @@
33
name: E2E (NVIDIA L40S x4)
44

55
on:
6-
schedule:
7-
- cron: '0 16 * * *' # Runs at 4PM UTC every day
8-
workflow_dispatch:
9-
inputs:
10-
pr_or_branch:
11-
description: 'pull request number or branch name'
12-
required: true
13-
default: 'main'
6+
### WILL BE UNCOMMENTED BEFORE MERGE
7+
# schedule:
8+
# - cron: '0 16 * * *' # Runs at 4PM UTC every day
9+
# workflow_dispatch:
10+
# inputs:
11+
# pr_or_branch:
12+
# description: 'pull request number or branch name'
13+
# required: true
14+
# default: 'main'
15+
16+
# FOR TESTING ON GITHUB ONLY. WILL BE REMOVED.
17+
push:
18+
branches:
19+
- courtneypacheco-add-xl-e2e-job
1420

1521
env:
1622
TMPDIR: /home/tmp
@@ -206,15 +212,15 @@ jobs:
206212
- name: Upload training logs Phase 1
207213
uses: actions/upload-artifact@v4
208214
with:
209-
name: phase-1-training-log.jsonl
215+
name: phase-1-training-log-large.jsonl
210216
path: ./instructlab/phase-1-training-log.jsonl
211217
retention-days: 1
212218
overwrite: true
213219

214220
- name: Upload training logs Phase 2
215221
uses: actions/upload-artifact@v4
216222
with:
217-
name: phase-2-training-log.jsonl
223+
name: phase-2-training-log-large.jsonl
218224
path: ./instructlab/phase-2-training-log.jsonl
219225
retention-days: 1
220226
overwrite: true
@@ -338,14 +344,14 @@ jobs:
338344
id: phase-1-download-logs
339345
uses: actions/download-artifact@v4
340346
with:
341-
name: phase-1-training-log.jsonl
347+
name: phase-1-training-log-large.jsonl
342348
path: downloaded-data
343349

344350
- name: Download loss data Phase 2
345351
id: phase-2-download-logs
346352
uses: actions/download-artifact@v4
347353
with:
348-
name: phase-2-training-log.jsonl
354+
name: phase-2-training-log-large.jsonl
349355
path: downloaded-data
350356

351357
- name: Checkout instructlab/training
@@ -366,7 +372,7 @@ jobs:
366372
continue-on-error: true
367373
run: |
368374
python training/scripts/create-loss-graph.py \
369-
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
375+
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
370376
--output-file "./phase-1-test.md" \
371377
--phase "1" \
372378
--aws-region "${{ vars.AWS_REGION }}" \
@@ -381,7 +387,7 @@ jobs:
381387
continue-on-error: true
382388
run: |
383389
python training/scripts/create-loss-graph.py \
384-
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
390+
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
385391
--output-file "./phase-2-test.md" \
386392
--phase "2" \
387393
--aws-region "${{ vars.AWS_REGION }}" \

0 commit comments

Comments
 (0)