Skip to content

Add MLflow support and expose logging configuration in TrainingArgs #287

Add MLflow support and expose logging configuration in TrainingArgs

Add MLflow support and expose logging configuration in TrainingArgs #287

Workflow file for this run

# SPDX-License-Identifier: Apache-2.0
name: 'Run smoke tests via Tox::pytest (python 3.12)'

# Smoke tests are long running and need accelerated hardware.
on:
  # Manual trigger; takes a `branch` input that defaults to main.
  workflow_dispatch:
    inputs:
      branch:
        type: string
        default: main
  # pull_request_target is used instead of pull_request because this
  # workflow has to run in the context of the base branch (main) and
  # read the repo's secrets in order to start the AWS instances.
  pull_request_target:
    branches:
      - 'main'
      - 'release-*'
    paths:
      # Keep this list in sync with the merging criteria in 'mergify.yml'.
      - '**.py'
      - 'tox.ini'
      - 'pyproject.toml'
      - 'requirements-dev.txt'
      - 'requirements-cuda.txt'
      - 'constraints-dev.txt'

# Default token scope for every job unless a job overrides it.
permissions:
  contents: read

# Every `run:` step uses bash unless it says otherwise.
defaults:
  run:
    shell: bash

env:
  ec2_runner_variant: 'g6e.12xlarge'  # 4x L40s
jobs:
start-large-ec2-runner:
  # Launches an EC2 instance through the in-house
  # "launch-ec2-runner-with-fallback" action and exposes the resulting
  # runner label, instance id and region as job outputs.
  runs-on: ubuntu-latest
  outputs:
    label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
    ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
    ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
  steps:
    - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/ci-actions
        # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
        path: ci-actions
        ref: release-v0.1
        # Only the one action directory is needed from that repo.
        sparse-checkout: |
          actions/launch-ec2-runner-with-fallback
    - name: Launch EC2 Runner with Fallback
      id: launch-ec2-instance-with-fallback
      uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
      env:
        TMPDIR: "/tmp"
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        # JSON list of candidate regions with their subnets, AMI and
        # security group (presumably tried in the listed order; confirm
        # against the action's documentation).
        regions_config: >
          [
            {
              "region": "us-east-2",
              "subnets": {
                "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
            },
            {
              "region": "us-east-1",
              "subnets": {
                "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
            }
          ]
        try_spot_instance_first: false
        # Take the instance type from the workflow-level env value so it
        # is defined in exactly one place (currently "g6e.12xlarge").
        ec2_instance_type: ${{ env.ec2_runner_variant }}
        # Tags attached to the launched AWS resources for traceability.
        aws_resource_tags: >
          [
            {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]
run-smoke-tests:
  needs:
    - start-large-ec2-runner
  # Runs on the runner label exported by the launch job.
  runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
  # It is important that this job has no write permissions and has
  # no access to any secrets. This part is where we are running
  # untrusted code from PRs.
  permissions: {}
  steps:
    - name: "Checkout code"
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # Full history rather than a shallow clone.
        fetch-depth: 0
        # NOTE(review): `inputs.branch` is only populated for
        # workflow_dispatch runs. On pull_request_target it is empty, so
        # checkout falls back to the event's default ref (the base
        # branch). Presumably the run-smoke action fetches the PR head
        # itself; confirm.
        ref: ${{ inputs.branch }}
    - name: Run smoke tests
      uses: ./.github/actions/run-smoke
      with:
        # Quoted so YAML keeps the version a string instead of a float
        # (an unquoted 3.10 would silently become 3.1).
        python-version: "3.12"
stop-large-ec2-runner:
  # Tears down the EC2 runner started by start-large-ec2-runner.
  needs:
    - start-large-ec2-runner
    - run-smoke-tests
  runs-on: ubuntu-latest
  # always(): run the teardown even when the smoke tests failed or the
  # run was cancelled. The instance-id guard skips the job when the
  # launch job reported no instance, so the steps below are not run
  # with an empty region and instance id.
  if: ${{ always() && needs.start-large-ec2-runner.outputs.ec2-instance-id != '' }}
  steps:
    - name: "Configure AWS credentials"
      uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        # Use the region the instance was actually launched in.
        aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
    - name: "Stop EC2 runner"
      uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        label: ${{ needs.start-large-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}