Skip to content

Commit a64779c

Browse files
committed
ci: Introduce python 3.12 e2e large job flavor
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
1 parent 370102c commit a64779c

File tree

3 files changed

+455
-184
lines changed

3 files changed

+455
-184
lines changed

.github/actions/run-e2e/action.yml

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
# Composite action metadata: display name, summary, and the inputs every
# caller has to provide.
name: "Run e2e tests"
description: "Runs e2e tests"
inputs:
  # Spliced into `python<version>` package and interpreter names by the steps.
  python-version:
    description: 'Python version to use. Must be in the form of "3.xx".'
    required: true
  # Exported as GH_TOKEN for the steps that call the gh CLI.
  gh-token:
    description: "GitHub token to use for authentication."
    required: true
  # Exported as HF_TOKEN for the e2e test step.
  hf-token:
    description: "Hugging Face token to use for authentication."
    required: true
  # Exported as OPENAI_API_KEY for the e2e test step.
  openai-api-key:
    description: "OpenAI API key to use for authentication."
    required: true
  # Webhook handed to the Discord status notification steps.
  son-of-jeeves-discord-webhook:
    description: "Son of Jeeves webhook (Discord)."
    required: true
24+
runs:
25+
using: "composite"
26+
steps:
27+
# Prints the OS release, makes sure the temporary directory exists, and
# installs the build toolchain, git, and the requested Python plus headers.
- name: Install Packages
  shell: bash
  run: |
    cat /etc/os-release
    # TMPDIR comes from the caller's environment. Fall back to /tmp so that an
    # unset variable does not become `mkdir -p ""`, which fails the step.
    mkdir -p "${TMPDIR:-/tmp}"
    sudo dnf install -y gcc gcc-c++ make git python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
33+
34+
# Clones instructlab/instructlab into ./instructlab with complete history.
- name: Checkout instructlab/instructlab
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
  with:
    path: 'instructlab'
    repository: 'instructlab/instructlab'
    # Depth 0 fetches everything; see
    # https://github.com/actions/checkout/issues/249
    fetch-depth: 0
41+
42+
# Classifies the `pr_or_branch` dispatch input: an all-digit value is treated
# as a pull request number, anything else as a branch name.
# Outputs: is_pr ("true"/"false") and pr_or_branch (the resolved value).
- name: Determine if pr_or_branch is a PR number
  id: check_pr
  shell: bash
  env:
    # Hand the user-supplied value to the script through the environment
    # instead of interpolating it into the script text: interpolation would
    # let shell metacharacters in the input run as commands and would
    # word-split values containing spaces. Default to 'main' if not set.
    PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
  run: |
    if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
      echo "is_pr=true" >> "$GITHUB_OUTPUT"
    else
      echo "is_pr=false" >> "$GITHUB_OUTPUT"
    fi
    echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
53+
54+
# Records in the `gh_cli_installed` output whether `gh` is already on PATH.
- name: Check if gh cli is installed
  id: gh_cli
  shell: bash
  run: |
    gh_present=false
    if command -v gh > /dev/null 2>&1; then
      gh_present=true
    fi
    echo "gh_cli_installed=${gh_present}" >> "$GITHUB_OUTPUT"
63+
64+
# Installs the GitHub CLI from its RPM repository; skipped when the probe
# step already found `gh`.
- name: Install gh CLI
  if: steps.gh_cli.outputs.gh_cli_installed == 'false'
  shell: bash
  run: |
    # The config-manager plugin is needed to register the extra repository.
    sudo dnf install -y 'dnf-command(config-manager)'
    sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
    sudo dnf install -y gh --repo gh-cli
71+
72+
# Smoke test: fails the job early if `gh` cannot be executed.
- name: test gh CLI
  shell: bash
  run: gh --version
76+
77+
# Sets the repository running this workflow as the gh CLI default repository
# for the ./training checkout.
- name: set default repo
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: gh repo set-default ${{ github.server_url }}/${{ github.repository }}
84+
85+
# Announces on the pull request that this workflow run has started, with a
# link to the run. Only runs when the target was classified as a PR number.
- name: Add comment to PR
  if: steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
93+
94+
# Checks out the pull request head inside ./training through the gh CLI.
# The PR number is all digits by construction (see the check_pr step's regex).
- name: Fetch and checkout PR
  if: steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
102+
103+
# Checks out the requested branch inside ./training when the target is not a
# pull request number.
- name: Checkout branch
  if: steps.check_pr.outputs.is_pr == 'false'
  working-directory: ./training
  shell: bash
  env:
    # The branch name originates from user input. Passing it via the
    # environment and quoting it keeps shell metacharacters and whitespace
    # from being interpreted by the script.
    TARGET_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
  run: |
    git checkout "$TARGET_BRANCH"
109+
110+
# Runs the instructlab CUDA install script with PYTHON pointing at the
# requested interpreter.
- name: Install ilab
  shell: bash
  working-directory: ./instructlab
  env:
    PYTHON: python${{ inputs.python-version }}
  run: ./scripts/install-ilab-with-cuda.sh
115+
116+
# Installs the ./training checkout into the instructlab virtualenv, first
# plain and then with the `cuda` extra.
- name: Update instructlab-training library
  working-directory: ./training
  shell: bash
  run: |
    . ../instructlab/venv/bin/activate
    pip install .
    # Quoted so bash does not treat `[cuda]` as a glob character class that
    # could match a one-letter dotfile (.c, .u, .d, .a) in this directory.
    pip install '.[cuda]'
123+
124+
# Reports disk usage ahead of the test run; runs even if earlier steps failed.
- name: Check disk before tests
  if: always()
  shell: bash
  run: df -h
129+
130+
# Runs the instructlab e2e CI script inside the virtualenv, then collects the
# rank-0 training metric logs into ./instructlab/phase-<N>-training-log.jsonl.
- name: Run e2e test
  working-directory: ./instructlab
  env:
    HF_TOKEN: ${{ inputs.hf-token }}
    OPENAI_API_KEY: ${{ inputs.openai-api-key }}
  shell: bash
  run: |
    . venv/bin/activate

    # set preserve to true so we can retain the logs
    ./scripts/e2e-ci.sh -lp

    # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
    # and we know that it will be written into a directory created by `mktemp -d`.
    # Given this information, we can use the following command to find the file.
    # Results are ordered by modification time (oldest first) so the phase
    # numbers do not depend on find's arbitrary directory traversal order.
    # NOTE(review): assumes an earlier phase stops writing its log before a
    # later phase does — confirm.
    # NOTE(review): only /tmp/ is searched; if TMPDIR points elsewhere,
    # `mktemp -d` directories would live there instead — confirm.
    log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' | sort -n | cut -d' ' -f2-)
    phase_num=1
    # Read one path per line so paths containing spaces or glob characters
    # are not split or expanded.
    while IFS= read -r log_file; do
      # A here-string always yields at least one line, possibly empty.
      [[ -n "${log_file}" ]] || continue
      mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
      # Plain arithmetic assignment: unlike `((...))`, its exit status never
      # depends on the computed value, so it is safe under `bash -e`.
      phase_num=$((phase_num + 1))
    done <<< "${log_files}"
151+
152+
# Reports disk usage after the test run, regardless of the test outcome.
- name: Check disk after tests
  if: always()
  shell: bash
  run: df -h
157+
158+
# Publishes the phase 1 training log as a one-day artifact, replacing any
# artifact of the same name.
- name: Upload training logs Phase 1
  uses: actions/upload-artifact@v4
  with:
    path: ./instructlab/phase-1-training-log.jsonl
    name: phase-1-training-log.jsonl
    overwrite: true
    retention-days: 1
165+
166+
# Publishes the phase 2 training log as a one-day artifact, replacing any
# artifact of the same name.
- name: Upload training logs Phase 2
  uses: actions/upload-artifact@v4
  with:
    path: ./instructlab/phase-2-training-log.jsonl
    name: phase-2-training-log.jsonl
    overwrite: true
    retention-days: 1
173+
174+
# Posts a failure notice with a link to the run on the pull request; only
# runs for PR targets after an earlier step has failed.
- name: Add comment to PR if the workflow failed
  if: failure() && steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
182+
183+
# Posts a success notice with a link to the run on the pull request; only
# runs for PR targets when every earlier step succeeded.
- name: Add comment to PR if the workflow succeeded
  if: success() && steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
191+
192+
# Branch (non-PR) runs only: reports a failed job to the Discord webhook.
- name: Send Discord notification for failure
  if: failure() && steps.check_pr.outputs.is_pr == 'false'
  uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008  # v1.15.3
  with:
    title: "e2e-nvidia-l40s-x4"
    status: ${{ job.status }}
    webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
    color: 0xCB2431  # Red color for failure
    description: |
      Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
      Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
203+
204+
# Branch (non-PR) runs only: reports a successful job to the Discord webhook.
- name: Send Discord notification for success
  if: success() && steps.check_pr.outputs.is_pr == 'false'
  uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008  # v1.15.3
  with:
    title: "e2e-nvidia-l40s-x4"
    status: ${{ job.status }}
    webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
    color: 0x28A745  # Green color for success
    description: |
      Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
      Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.

0 commit comments

Comments
 (0)