Skip to content

E2E (NVIDIA L40S x4) (python 3.11) #551

E2E (NVIDIA L40S x4) (python 3.11)

E2E (NVIDIA L40S x4) (python 3.11) #551

# SPDX-License-Identifier: Apache-2.0
name: E2E (NVIDIA L40S x4) (python 3.11)
on:
schedule:
- cron: '0 16 * * *' # Runs at 4PM UTC every day
workflow_dispatch:
inputs:
pr_or_branch:
description: 'pull request number or branch name'
required: true
default: 'main'
env:
TMPDIR: /home/tmp
jobs:
start-large-ec2-runner:
if: github.repository_owner == 'instructlab'
runs-on: ubuntu-latest
outputs:
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
steps:
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: instructlab/ci-actions
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
path: ci-actions
ref: release-v0.1
sparse-checkout: |
actions/launch-ec2-runner-with-fallback
- name: Launch EC2 Runner with Fallback
id: launch-ec2-instance-with-fallback
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
env:
TMPDIR: "/tmp"
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
regions_config: >
[
{
"region": "us-east-2",
"subnets": {
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
},
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
},
{
"region": "us-east-1",
"subnets": {
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
},
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
}
]
try_spot_instance_first: false
ec2_instance_type: g6e.12xlarge
aws_resource_tags: >
[
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]
e2e-large-test:
needs:
- start-large-ec2-runner
runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
permissions:
pull-requests: write
steps:
- name: Checkout instructlab/training
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/training"
path: "training"
# https://github.com/actions/checkout/issues/249
fetch-depth: 0
- name: Run e2e tests
uses: ./training/.github/actions/run-e2e
with:
python-version: 3.11
gh-token: ${{ secrets.GITHUB_TOKEN }}
hf-token: ${{ secrets.HF_TOKEN }}
openai-api-key: ${{ secrets.OPENAI_API_KEY }}
son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
stop-large-ec2-runner:
needs:
- start-large-ec2-runner
- e2e-large-test
runs-on: ubuntu-latest
if: ${{ always() && github.repository_owner == 'instructlab' }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-large-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
loss-graphs:
needs:
- stop-large-ec2-runner
runs-on: ubuntu-latest
if: ${{ always() && github.repository_owner == 'instructlab' }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_REGION }}
- name: Download loss data Phase 1
id: phase-1-download-logs
uses: actions/download-artifact@v4
with:
name: phase-1-training-log.jsonl
path: downloaded-data
- name: Download loss data Phase 2
id: phase-2-download-logs
uses: actions/download-artifact@v4
with:
name: phase-2-training-log.jsonl
path: downloaded-data
- name: Checkout instructlab/training
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/training"
path: "training"
fetch-depth: 0
- name: Install dependencies
working-directory: ./training
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt -c constraints-dev.txt
- name: Try to upload Phase 1 to s3
id: phase-1-upload-s3
continue-on-error: true
run: |
python training/scripts/create-loss-graph.py \
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
--output-file "./phase-1-test.md" \
--phase "1" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${GITHUB_REF##*/}" \
--head-sha "${{ github.sha }}" \
--pr-number "${{ github.event.number }}" \
--origin-repository "${{ github.repository }}"
- name: Try to upload Phase 2 to s3
id: phase-2-upload-s3
continue-on-error: true
run: |
python training/scripts/create-loss-graph.py \
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
--output-file "./phase-2-test.md" \
--phase "2" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${GITHUB_REF##*/}" \
--head-sha "${{ github.sha }}" \
--pr-number "${{ github.event.number }}" \
--origin-repository "${{ github.repository }}"
- name: Check Phase 1 S3 upload status for success
if: steps.phase-1-upload-s3.outcome == 'success'
run: |
echo "Uploaded Phase 1 loss graph to S3."
cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
- name: Check Phase 2 S3 upload status for success
if: steps.phase-2-upload-s3.outcome == 'success'
run: |
echo "Uploaded Phase 2 loss graph to S3."
cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
- name: Check Phase 1 S3 upload status for failure
if: steps.phase-1-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
- name: Check Phase 2 S3 upload status for failure
if: steps.phase-2-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"