Skip to content

Commit c362aad

Browse files
chore: Add EC2 fallback logic CI-ACTION to medium + large E2E jobs
Our large E2E job is currently failing because it is unable to find an instance (see https://github.com/instructlab/eval/actions/runs/15564451460). I'm also updating the medium E2E job to use the same action as a precaution. Signed-off-by: Courtney Pacheco <[email protected]>
1 parent de23ebb commit c362aad

File tree

2 files changed

+101
-39
lines changed

2 files changed

+101
-39
lines changed

.github/workflows/e2e-nvidia-l4-x1.yml

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,28 +41,59 @@ jobs:
4141
start-medium-ec2-runner:
4242
runs-on: ubuntu-latest
4343
outputs:
44-
label: ${{ steps.start-ec2-runner.outputs.label }}
45-
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
44+
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
45+
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
46+
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
4647
steps:
47-
- name: Configure AWS credentials
48-
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
48+
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
49+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
4950
with:
50-
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
51-
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
52-
aws-region: ${{ vars.AWS_REGION }}
53-
54-
- name: Start EC2 runner
55-
id: start-ec2-runner
56-
uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
51+
repository: instructlab/ci-actions
52+
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of
53+
# overwriting the current WORKDIR contents
54+
path: ci-actions
55+
ref: release-v0.2
56+
sparse-checkout: |
57+
actions/launch-ec2-runner-with-fallback
58+
59+
- name: Launch EC2 Runner with Fallback
60+
id: launch-ec2-instance-with-fallback
61+
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
62+
env:
63+
TMPDIR: "/tmp"
5764
with:
58-
mode: start
59-
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
60-
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
61-
ec2-instance-type: g6.8xlarge
62-
subnet-id: subnet-02d230cffd9385bd4
63-
security-group-id: sg-06300447c4a5fbef3
64-
iam-role-name: instructlab-ci-runner
65-
aws-resource-tags: >
65+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
66+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
67+
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
68+
regions_config: >
69+
[
70+
{
71+
"region": "us-east-2",
72+
"subnets": {
73+
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
74+
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
75+
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
76+
},
77+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
78+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
79+
},
80+
{
81+
"region": "us-east-1",
82+
"subnets": {
83+
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
84+
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
85+
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
86+
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
87+
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
88+
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
89+
},
90+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
91+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
92+
}
93+
]
94+
try_spot_instance_first: false
95+
ec2_instance_type: g6.8xlarge
96+
aws_resource_tags: >
6697
[
6798
{"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
6899
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
@@ -155,7 +186,7 @@ jobs:
155186
with:
156187
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
157188
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
158-
aws-region: ${{ vars.AWS_REGION }}
189+
aws-region: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-region }}
159190
- name: Stop EC2 runner
160191
uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
161192
with:

.github/workflows/e2e-nvidia-l40s-x4.yml

Lines changed: 50 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,28 +19,59 @@ jobs:
1919
start-large-ec2-runner:
2020
runs-on: ubuntu-latest
2121
outputs:
22-
label: ${{ steps.start-ec2-runner.outputs.label }}
23-
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
22+
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
23+
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
24+
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
2425
steps:
25-
- name: Configure AWS credentials
26-
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
26+
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
27+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
2728
with:
28-
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
29-
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
30-
aws-region: ${{ vars.AWS_REGION }}
29+
repository: instructlab/ci-actions
30+
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of
31+
# overwriting the current WORKDIR contents
32+
path: ci-actions
33+
ref: release-v0.2
34+
sparse-checkout: |
35+
actions/launch-ec2-runner-with-fallback
3136
32-
- name: Start EC2 runner
33-
id: start-ec2-runner
34-
uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
37+
- name: Launch EC2 Runner with Fallback
38+
id: launch-ec2-instance-with-fallback
39+
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
40+
env:
41+
TMPDIR: "/tmp"
3542
with:
36-
mode: start
37-
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
38-
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
39-
ec2-instance-type: g6e.12xlarge
40-
subnet-id: subnet-024298cefa3bedd61
41-
security-group-id: sg-06300447c4a5fbef3
42-
iam-role-name: instructlab-ci-runner
43-
aws-resource-tags: >
43+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
44+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
45+
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
46+
regions_config: >
47+
[
48+
{
49+
"region": "us-east-2",
50+
"subnets": {
51+
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
52+
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
53+
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
54+
},
55+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
56+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
57+
},
58+
{
59+
"region": "us-east-1",
60+
"subnets": {
61+
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
62+
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
63+
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
64+
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
65+
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
66+
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
67+
},
68+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
69+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
70+
}
71+
]
72+
try_spot_instance_first: false
73+
ec2_instance_type: g6e.12xlarge
74+
aws_resource_tags: >
4475
[
4576
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
4677
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
@@ -235,7 +266,7 @@ jobs:
235266
with:
236267
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
237268
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
238-
aws-region: ${{ vars.AWS_REGION }}
269+
aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
239270

240271
- name: Stop EC2 runner
241272
uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1

0 commit comments

Comments
 (0)