Skip to content

Commit 3d0e845

Browse files
committed
ci: Introduce python 3.12 e2e large job flavor
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
1 parent e8cb0a0 commit 3d0e845

File tree

3 files changed

+471
-185
lines changed

3 files changed

+471
-185
lines changed

.github/actions/run-e2e/action.yml

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
# Composite action that runs the e2e test suite.
name: Run e2e tests
description: Runs e2e tests

# Every input is mandatory; secrets are passed in by the calling workflow
# because composite actions cannot read the `secrets` context themselves.
inputs:
  python-version:
    description: 'Python version to use. Must be in the form of "3.xx".'
    required: true
  gh-token:
    description: 'GitHub token to use for authentication.'
    required: true
  hf-token:
    description: 'Hugging Face token to use for authentication.'
    required: true
  openai-api-key:
    description: 'OpenAI API key to use for authentication.'
    required: true
  son-of-jeeves-discord-webhook:
    description: 'Son of Jeeves webhook (Discord).'
    required: true

runs:
  using: composite
  steps:
27+
- name: Install Packages
  shell: bash
  run: |
    # Print the distribution details to make runner images easy to identify in logs.
    cat /etc/os-release
    # TMPDIR is not declared or set by this action, so it may be unset or empty
    # here; `mkdir -p ""` would fail the step. Fall back to /tmp in that case.
    mkdir -p "${TMPDIR:-/tmp}"
    # Build toolchain plus the requested interpreter and its headers.
    sudo dnf install -y gcc gcc-c++ make git python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
33+
34+
# Clone the instructlab repository into ./instructlab with its full history.
- name: Checkout instructlab/instructlab
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    repository: instructlab/instructlab
    path: instructlab
    # Full history is required, see
    # https://github.com/actions/checkout/issues/249
    fetch-depth: 0
41+
42+
# Classifies the `pr_or_branch` workflow input and exposes two step outputs:
#   is_pr        - "true" when the value is all digits (a PR number), else "false"
#   pr_or_branch - the value itself, defaulting to "main" when the input is unset
- name: Determine if pr_or_branch is a PR number
  id: check_pr
  shell: bash
  env:
    # The input is user-controlled. Pass it through the environment instead of
    # expanding it into the script text, so its content is never parsed as
    # shell code (script injection) and values with spaces stay intact.
    PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
  run: |
    if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
      echo "is_pr=true" >> "$GITHUB_OUTPUT"
    else
      echo "is_pr=false" >> "$GITHUB_OUTPUT"
    fi
    echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
53+
54+
# Sets the step output `gh_cli_installed` to "true" or "false" depending on
# whether a `gh` command can be resolved on this runner.
- name: Check if gh cli is installed
  id: gh_cli
  shell: bash
  run: |
    gh_cli_installed=false
    if command -v gh >/dev/null 2>&1; then
      gh_cli_installed=true
    fi
    echo "gh_cli_installed=${gh_cli_installed}" >> "$GITHUB_OUTPUT"
63+
64+
# Only runs when the previous probe reported that `gh` is missing.
- name: Install gh CLI
  if: steps.gh_cli.outputs.gh_cli_installed == 'false'
  shell: bash
  run: |
    # The config-manager plugin is needed to register the extra repository.
    sudo dnf install -y 'dnf-command(config-manager)'
    sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
    # Install gh from the repository that was just added.
    sudo dnf install -y gh --repo gh-cli
71+
72+
# Smoke test: the step fails if `gh` cannot be executed.
- name: test gh CLI
  shell: bash
  run: gh --version
76+
77+
# Points `gh` at the repository running this workflow, so that later `gh pr`
# commands issued from ./training resolve against it.
- name: set default repo
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh repo set-default ${{ github.server_url }}/${{ github.repository }}
84+
85+
# Announces on the PR that the workflow has started, with a link to this run.
# Skipped when the run targets a branch rather than a PR number.
- name: Add comment to PR
  if: steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
93+
94+
# PR mode: let `gh` fetch the PR head and switch ./training to it.
# The value is known to be all digits here because is_pr is 'true'.
- name: Fetch and checkout PR
  if: steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
102+
103+
# Branch mode: switch ./training to the requested branch.
- name: Checkout branch
  if: steps.check_pr.outputs.is_pr == 'false'
  working-directory: ./training
  shell: bash
  env:
    # In this branch of the logic the value is an arbitrary, unvalidated string
    # from the workflow input. Hand it over via the environment and quote it so
    # it cannot inject shell commands or be word-split into extra arguments.
    BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
  run: |
    git checkout "$BRANCH"
109+
110+
# Runs the instructlab install script, telling it which interpreter to use
# through the PYTHON environment variable.
- name: Install ilab
  shell: bash
  working-directory: ./instructlab
  env:
    PYTHON: python${{ inputs.python-version }}
  run: |
    ./scripts/install-ilab-with-cuda.sh
115+
116+
# Installs the training library from ./training into the virtual environment
# that the ilab install step created under ../instructlab/venv.
- name: Update instructlab-training library
  working-directory: ./training
  shell: bash
  run: |
    . ../instructlab/venv/bin/activate

    # Patch out our own pin from the ilab repo constraints file
    ilab_constraints=../instructlab/constraints-dev.txt
    sed -i '/instructlab-training==/d' "$ilab_constraints"

    # Since we reuse the virtual environment prepared using ilab
    # constraints, we should stick to the same constraints when
    # installing latest training.
    #
    # FIXME: this is not ideal; a proper fix would require decoupling the
    # two repos in CI: either by removing the job completely and relying
    # on "sdk" (no ilab) test runs; or by preparing a separate
    # constraints file that would consider both the requirements files
    # for the training library AND for the ilab - so that they are
    # consistent.
    #
    # The command is kept in an array so each argument stays one word.
    pip_install=(pip install -c "$ilab_constraints")
    "${pip_install[@]}" .
    # Quoted: unquoted `.[cuda]` is a glob pattern that would expand to a
    # stray file named `.c`, `.u`, `.d` or `.a` if one exists here.
    "${pip_install[@]}" '.[cuda]'
139+
140+
# Disk usage snapshot taken before the tests; runs even if earlier steps failed.
- name: Check disk before tests
  if: always()
  shell: bash
  run: df -h
145+
146+
# Runs the e2e script inside the ilab virtual environment, then collects the
# per-phase training logs into ./instructlab as phase-<N>-training-log.jsonl
# so the upload steps can pick them up.
- name: Run e2e test
  working-directory: ./instructlab
  env:
    HF_TOKEN: ${{ inputs.hf-token }}
    OPENAI_API_KEY: ${{ inputs.openai-api-key }}
  shell: bash
  run: |
    . venv/bin/activate

    # set preserve to true so we can retain the logs
    ./scripts/e2e-ci.sh -lp

    # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
    # and we know that it will be written into a directory created by `mktemp -d`.
    # `mktemp -d` honours TMPDIR, so search there as well as /tmp, unless
    # TMPDIR already lives under /tmp (which would list each file twice).
    # NOTE(review): assumes the logs land in one of these two roots - confirm.
    search_dirs=(/tmp/)
    if [[ -n "${TMPDIR:-}" ]]; then
      case "$(realpath -m "${TMPDIR}")/" in
        /tmp/*) ;;
        *) search_dirs+=("${TMPDIR}") ;;
      esac
    fi

    # `find` returns entries in an unspecified order, which could swap the
    # phase numbers. Order by modification time (oldest first) so the numbering
    # follows the order in which the logs were written.
    mapfile -t log_files < <(
      find "${search_dirs[@]}" -name "training_params_and_metrics_global0.jsonl" -printf '%T@\t%p\n' \
        | sort -n \
        | cut -f2-
    )

    phase_num=1
    for log_file in "${log_files[@]}"; do
      mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
      phase_num=$((phase_num + 1))
    done
167+
168+
# Disk usage snapshot taken after the tests; runs regardless of their outcome.
- name: Check disk after tests
  if: always()
  shell: bash
  run: df -h
173+
174+
# Publishes the phase 1 training log collected by the e2e step as an artifact.
- name: Upload training logs Phase 1
  uses: actions/upload-artifact@v4
  with:
    name: phase-1-training-log.jsonl
    path: ./instructlab/phase-1-training-log.jsonl
    # Replace an artifact of the same name if one already exists.
    overwrite: true
    # Keep the artifact for one day only.
    retention-days: 1
181+
182+
# Publishes the phase 2 training log collected by the e2e step as an artifact.
- name: Upload training logs Phase 2
  uses: actions/upload-artifact@v4
  with:
    name: phase-2-training-log.jsonl
    path: ./instructlab/phase-2-training-log.jsonl
    # Replace an artifact of the same name if one already exists.
    overwrite: true
    # Keep the artifact for one day only.
    retention-days: 1
189+
190+
# PR mode only: report a failed run back on the PR with a link to this run.
- name: Add comment to PR if the workflow failed
  if: failure() && steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
198+
199+
# PR mode only: report a successful run back on the PR with a link to this run.
- name: Add comment to PR if the workflow succeeded
  if: success() && steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
207+
208+
# Branch mode only: post a failure notice to the Discord webhook.
- name: Send Discord notification for failure
  if: failure() && steps.check_pr.outputs.is_pr == 'false'
  uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
  with:
    webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
    title: "e2e-nvidia-l40s-x4"
    status: ${{ job.status }}
    color: 0xCB2431 # Red color for failure
    description: |
      Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
      Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
219+
220+
# Branch mode only: post a success notice to the Discord webhook.
- name: Send Discord notification for success
  if: success() && steps.check_pr.outputs.is_pr == 'false'
  uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
  with:
    webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
    title: "e2e-nvidia-l40s-x4"
    status: ${{ job.status }}
    color: 0x28A745 # Green color for success
    description: |
      Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
      Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.

0 commit comments

Comments
 (0)