Skip to content

Commit 167058a

Browse files
committed
ci: Introduce python 3.12 e2e large job flavor
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
1 parent c9e07e9 commit 167058a

File tree

3 files changed

+489
-212
lines changed

3 files changed

+489
-212
lines changed

.github/actions/run-e2e/action.yml

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
# Composite action: installs build prerequisites and the requested Python,
# checks out instructlab/instructlab, switches the ./training checkout to the
# requested PR or branch, installs ilab plus the local training library, runs
# the e2e script, uploads the per-phase training logs, and reports the result
# (PR comments for PR runs; Slack and Discord for branch runs).
#
# NOTE(review): every step that uses `working-directory: ./training` relies on
# that directory already existing; this action never checks it out, so it is
# presumably provided by the calling workflow - confirm.
name: 'Run e2e tests'
description: 'Runs e2e tests'
inputs:
  python-version:
    required: true
    description: >-
      Python version to use. Must be in the form of "3.xx".
  gh-token:
    required: true
    description: >-
      GitHub token to use for authentication.
  hf-token:
    required: true
    description: >-
      Hugging Face token to use for authentication.
  openai-api-key:
    required: true
    description: >-
      OpenAI API key to use for authentication.
  son-of-jeeves-token:
    required: true
    description: >-
      Son of Jeeves token (Slack).
  son-of-jeeves-discord-webhook:
    required: true
    description: >-
      Son of Jeeves webhook (Discord).
runs:
  using: "composite"
  steps:
    - name: Install Packages
      shell: bash
      run: |
        cat /etc/os-release
        # Fall back to /tmp so the step does not fail on `mkdir -p ""` when
        # the caller has not exported TMPDIR.
        mkdir -p "${TMPDIR:-/tmp}"
        sudo dnf install -y gcc gcc-c++ make git "python${PYTHON_VERSION}" "python${PYTHON_VERSION}-devel"
      env:
        # Passed via the environment instead of being expanded into the
        # script text, so the value cannot be interpreted as shell syntax.
        PYTHON_VERSION: ${{ inputs.python-version }}

    - name: Checkout instructlab/instructlab
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: "instructlab/instructlab"
        path: "instructlab"
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Determine if pr_or_branch is a PR number
      id: check_pr
      shell: bash
      run: |
        # An all-digit value is treated as a PR number; anything else is
        # treated as a branch name.
        if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
          echo "is_pr=true" >> "$GITHUB_OUTPUT"
        else
          echo "is_pr=false" >> "$GITHUB_OUTPUT"
        fi
        echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
      env:
        # Default to 'main' if not set. The user-supplied value is handed over
        # as an environment variable rather than interpolated into the script,
        # which would allow shell injection.
        PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}

    - name: Check if gh cli is installed
      id: gh_cli
      shell: bash
      run: |
        if command -v gh &> /dev/null ; then
          echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
        else
          echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Install gh CLI
      if: steps.gh_cli.outputs.gh_cli_installed == 'false'
      shell: bash
      run: |
        sudo dnf install 'dnf-command(config-manager)' -y
        sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
        sudo dnf install gh --repo gh-cli -y

    - name: test gh CLI
      shell: bash
      run: |
        gh --version

    - name: set default repo
      working-directory: ./training
      shell: bash
      run: |
        gh repo set-default ${{ github.server_url }}/${{ github.repository }}
      env:
        GH_TOKEN: ${{ inputs.gh-token }}

    - name: Add comment to PR
      if: steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      shell: bash
      run: |
        gh pr comment "${PR_OR_BRANCH}" -b "${WORKFLOW_NAME} workflow launched on this PR: [View run](${RUN_URL})"
      env:
        GH_TOKEN: ${{ inputs.gh-token }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
        WORKFLOW_NAME: ${{ github.workflow }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    - name: Fetch and checkout PR
      if: steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      shell: bash
      run: |
        gh pr checkout "${PR_OR_BRANCH}"
      env:
        GH_TOKEN: ${{ inputs.gh-token }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

    - name: Checkout branch
      if: steps.check_pr.outputs.is_pr == 'false'
      working-directory: ./training
      shell: bash
      run: |
        git checkout "${PR_OR_BRANCH}"
      env:
        # Branch names are user input; keep them out of the script text.
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}

    - name: Install ilab
      working-directory: ./instructlab
      shell: bash
      run: |
        PYTHON="python${PYTHON_VERSION}" ./scripts/install-ilab-with-cuda.sh
      env:
        PYTHON_VERSION: ${{ inputs.python-version }}

    - name: Update instructlab-training library
      working-directory: ./training
      shell: bash
      run: |
        . ../instructlab/venv/bin/activate
        pip install .
        # Quoted so bash cannot treat `.[cuda]` as a glob pattern.
        pip install ".[cuda]"

    - name: Check disk before tests
      if: always()
      shell: bash
      run: |
        df -h

    - name: Run e2e test
      working-directory: ./instructlab
      env:
        HF_TOKEN: ${{ inputs.hf-token }}
        OPENAI_API_KEY: ${{ inputs.openai-api-key }}
      shell: bash
      run: |
        . venv/bin/activate

        # set preserve to true so we can retain the logs
        ./scripts/e2e-ci.sh -lp

        # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
        # and we know that it will be written into a directory created by `mktemp -d`.
        # Given this information, we can use the following command to find the file.
        # Results are ordered by modification time (oldest first) so that the
        # phase numbers are assigned deterministically instead of following
        # the unspecified traversal order of `find`.
        # NOTE(review): only /tmp/ is searched; if TMPDIR points elsewhere the
        # logs may not be found here - confirm against the e2e script.
        log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl" -printf '%T@\t%p\n' | sort -n | cut -f2-)
        phase_num=1
        while IFS= read -r log_file; do
          # A here-string yields one empty line when nothing was found.
          [[ -n "${log_file}" ]] || continue
          mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
          phase_num=$((phase_num + 1))
        done <<< "${log_files}"

    - name: Check disk after tests
      if: always()
      shell: bash
      run: |
        df -h

    # NOTE(review): unlike the other third-party actions in this file,
    # upload-artifact is referenced by tag rather than by a pinned commit SHA.
    - name: Upload training logs Phase 1
      uses: actions/upload-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: ./instructlab/phase-1-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Upload training logs Phase 2
      uses: actions/upload-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: ./instructlab/phase-2-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Add comment to PR if the workflow failed
      if: failure() && steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      shell: bash
      run: |
        gh pr comment "${PR_OR_BRANCH}" -b "e2e workflow failed on this PR: [View run](${RUN_URL}), please investigate."
      env:
        GH_TOKEN: ${{ inputs.gh-token }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    - name: Add comment to PR if the workflow succeeded
      if: success() && steps.check_pr.outputs.is_pr == 'true'
      working-directory: ./training
      shell: bash
      run: |
        gh pr comment "${PR_OR_BRANCH}" -b "e2e workflow succeeded on this PR: [View run](${RUN_URL}), congrats!"
      env:
        GH_TOKEN: ${{ inputs.gh-token }}
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    # NOTE(review): the job name in the Slack/Discord messages below is
    # hard-coded to "e2e-nvidia-l40s-x4"; every workflow that reuses this
    # action will report under that name.
    - name: Post job results to Slack if the workflow failed
      if: failure() && steps.check_pr.outputs.is_pr == 'false'
      id: slack-report-failure
      uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 # v2.1.0
      with:
        token: ${{ inputs.son-of-jeeves-token }}
        method: chat.postMessage
        payload: |
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
          channel: 'e2e-ci-results'
          text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

    - name: Post job results to Slack if the workflow succeeded
      if: success() && steps.check_pr.outputs.is_pr == 'false'
      id: slack-report-success
      uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 # v2.1.0
      with:
        token: ${{ inputs.son-of-jeeves-token }}
        method: chat.postMessage
        payload: |
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
          channel: 'e2e-ci-results'
          text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

    - name: Send Discord notification for failure
      if: failure() && steps.check_pr.outputs.is_pr == 'false'
      uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
      with:
        webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
        status: ${{ job.status }}
        title: "e2e-nvidia-l40s-x4"
        description: |
          Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
          Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
        color: 0xCB2431 # Red color for failure

    - name: Send Discord notification for success
      if: success() && steps.check_pr.outputs.is_pr == 'false'
      uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
      with:
        webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
        status: ${{ job.status }}
        title: "e2e-nvidia-l40s-x4"
        description: |
          Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
          Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
        color: 0x28A745 # Green color for success

0 commit comments

Comments
 (0)