Skip to content

Commit 3897137

Browse files
committed
chore(ci): fallback on permanent h100 instance on shortage
When a shortage occurs on n3-H100x1 instances on Hyperstack, we'll fall back on the permanent one registered on GitHub. This can be done by using 'h100x1' as runner label to run a job on it.
1 parent 3988c85 commit 3897137

7 files changed

+120
-21
lines changed

.github/workflows/benchmark_gpu_core_crypto.yml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,16 @@ jobs:
2323
if: github.event_name != 'schedule' ||
2424
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
2525
outputs:
26-
runner-name: ${{ steps.start-instance.outputs.label }}
26+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
27+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
28+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
29+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
30+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
31+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
2732
steps:
28-
- name: Start instance
29-
id: start-instance
33+
- name: Start remote instance
34+
id: start-remote-instance
35+
continue-on-error: true
3036
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
3137
with:
3238
mode: start
@@ -36,6 +42,13 @@ jobs:
3642
backend: hyperstack
3743
profile: single-h100
3844

45+
# Fall back on the permanent Hyperstack instance. Check the step outcome rather than failure(): the start step uses continue-on-error, so failure() never reports it.
46+
- name: Use permanent remote instance
47+
id: use-permanent-instance
48+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
49+
run: |
50+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
51+
3952
cuda-core-crypto-benchmarks:
4053
name: Execute GPU core crypto benchmarks
4154
needs: setup-instance
@@ -57,6 +70,7 @@ jobs:
5770
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
5871

5972
- name: Setup Hyperstack dependencies
73+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
6074
uses: ./.github/actions/gpu_setup
6175
with:
6276
cuda-version: ${{ matrix.cuda }}
@@ -128,7 +142,7 @@ jobs:
128142

129143
teardown-instance:
130144
name: Teardown instance (cuda-integer-full-benchmarks)
131-
if: ${{ always() && needs.setup-instance.result == 'success' }}
145+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
132146
needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
133147
runs-on: ubuntu-latest
134148
steps:

.github/workflows/benchmark_gpu_erc20_common.yml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,16 @@ jobs:
5050
if: github.event_name == 'workflow_dispatch' ||
5151
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
5252
outputs:
53-
runner-name: ${{ steps.start-instance.outputs.label }}
53+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
54+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
55+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
56+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
57+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
58+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
5459
steps:
55-
- name: Start instance
56-
id: start-instance
60+
- name: Start remote instance
61+
id: start-remote-instance
62+
continue-on-error: true
5763
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
5864
with:
5965
mode: start
@@ -63,6 +69,13 @@ jobs:
6369
backend: ${{ inputs.backend }}
6470
profile: ${{ inputs.profile }}
6571

72+
# Fall back on the permanent Hyperstack instance (single-h100 profile only). Check the step outcome rather than failure(): the start step uses continue-on-error, so failure() never reports it.
73+
- name: Use permanent remote instance
74+
id: use-permanent-instance
75+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
76+
run: |
77+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
78+
6679
cuda-erc20-benchmarks:
6780
name: Cuda ERC20 benchmarks (${{ inputs.profile }})
6881
needs: setup-instance
@@ -84,6 +97,7 @@ jobs:
8497
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
8598

8699
- name: Setup Hyperstack dependencies
100+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
87101
uses: ./.github/actions/gpu_setup
88102
with:
89103
cuda-version: ${{ matrix.cuda }}
@@ -154,7 +168,7 @@ jobs:
154168

155169
teardown-instance:
156170
name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
157-
if: ${{ always() && needs.setup-instance.result == 'success' }}
171+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
158172
needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
159173
runs-on: ubuntu-latest
160174
steps:

.github/workflows/benchmark_gpu_integer_common.yml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,16 @@ jobs:
114114
needs: prepare-matrix
115115
runs-on: ubuntu-latest
116116
outputs:
117-
runner-name: ${{ steps.start-instance.outputs.label }}
117+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
118+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
119+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
120+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
121+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
122+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
118123
steps:
119-
- name: Start instance
120-
id: start-instance
124+
- name: Start remote instance
125+
id: start-remote-instance
126+
continue-on-error: true
121127
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
122128
with:
123129
mode: start
@@ -127,6 +133,13 @@ jobs:
127133
backend: ${{ inputs.backend }}
128134
profile: ${{ inputs.profile }}
129135

136+
# Fall back on the permanent Hyperstack instance (single-h100 profile only). Check the step outcome rather than failure(): the start step uses continue-on-error, so failure() never reports it.
137+
- name: Use permanent remote instance
138+
id: use-permanent-instance
139+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
140+
run: |
141+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
142+
130143
cuda-benchmarks:
131144
name: Cuda benchmarks (${{ inputs.profile }})
132145
needs: [ prepare-matrix, setup-instance ]
@@ -154,6 +167,7 @@ jobs:
154167
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
155168

156169
- name: Setup Hyperstack dependencies
170+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
157171
uses: ./.github/actions/gpu_setup
158172
with:
159173
cuda-version: ${{ matrix.cuda }}
@@ -230,7 +244,7 @@ jobs:
230244

231245
teardown-instance:
232246
name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
233-
if: ${{ always() && needs.setup-instance.result == 'success' }}
247+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
234248
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
235249
runs-on: ubuntu-latest
236250
steps:

.github/workflows/gpu_fast_h100_tests.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,17 @@ jobs:
6868
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
6969
runs-on: ubuntu-latest
7070
outputs:
71-
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
71+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
72+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
73+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
74+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
75+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
76+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
7277
steps:
7378
- name: Start remote instance
7479
id: start-remote-instance
7580
if: env.SECRETS_AVAILABLE == 'true'
81+
continue-on-error: true
7682
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
7783
with:
7884
mode: start
@@ -82,6 +88,13 @@ jobs:
8288
backend: hyperstack
8389
profile: single-h100
8490

91+
# Fall back on the permanent Hyperstack instance (runner label 'h100x1') when starting the on-demand remote instance failed.
92+
- name: Use permanent remote instance
93+
id: use-permanent-instance
94+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
95+
run: |
96+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
97+
8598
# This instance will be spawned especially for pull-request from forked repository
8699
- name: Start GitHub instance
87100
id: start-github-instance
@@ -114,6 +127,7 @@ jobs:
114127
token: ${{ env.CHECKOUT_TOKEN }}
115128

116129
- name: Setup Hyperstack dependencies
130+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
117131
uses: ./.github/actions/gpu_setup
118132
with:
119133
cuda-version: ${{ matrix.cuda }}
@@ -159,7 +173,7 @@ jobs:
159173

160174
teardown-instance:
161175
name: Teardown instance (cuda-h100-tests)
162-
if: ${{ always() && needs.setup-instance.result == 'success' }}
176+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
163177
needs: [ setup-instance, cuda-tests-linux ]
164178
runs-on: ubuntu-latest
165179
steps:

.github/workflows/gpu_full_h100_tests.yml

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,16 @@ jobs:
2020
name: Setup instance (cuda-h100-tests)
2121
runs-on: ubuntu-latest
2222
outputs:
23-
runner-name: ${{ steps.start-instance.outputs.label }}
23+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
24+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
25+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
26+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
27+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
28+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
2429
steps:
25-
- name: Start instance
26-
id: start-instance
30+
- name: Start remote instance
31+
id: start-remote-instance
32+
continue-on-error: true
2733
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
2834
with:
2935
mode: start
@@ -33,6 +39,13 @@ jobs:
3339
backend: hyperstack
3440
profile: single-h100
3541

42+
# Fall back on the permanent Hyperstack instance. Check the step outcome rather than failure(): the start step uses continue-on-error, so failure() never reports it.
43+
- name: Use permanent remote instance
44+
id: use-permanent-instance
45+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
46+
run: |
47+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
48+
3649
cuda-tests-linux:
3750
name: CUDA H100 tests
3851
needs: [ setup-instance ]
@@ -68,6 +81,7 @@ jobs:
6881
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
6982

7083
- name: Setup Hyperstack dependencies
84+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
7185
uses: ./.github/actions/gpu_setup
7286
with:
7387
cuda-version: ${{ matrix.cuda }}
@@ -109,6 +123,7 @@ jobs:
109123

110124
teardown-instance:
111125
name: Teardown instance (cuda-h100-tests)
126+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
112127
needs: [ setup-instance, cuda-tests-linux ]
113128
runs-on: ubuntu-latest
114129
steps:

.github/workflows/gpu_signed_integer_h100_tests.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,17 @@ jobs:
6969
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
7070
runs-on: ubuntu-latest
7171
outputs:
72-
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
72+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
73+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
74+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
75+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
76+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
77+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
7378
steps:
7479
- name: Start remote instance
7580
id: start-remote-instance
7681
if: env.SECRETS_AVAILABLE == 'true'
82+
continue-on-error: true
7783
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
7884
with:
7985
mode: start
@@ -83,6 +89,13 @@ jobs:
8389
backend: hyperstack
8490
profile: single-h100
8591

92+
# Fall back on the permanent Hyperstack instance. Check the step outcome rather than failure(): the start step uses continue-on-error, so failure() never reports it.
93+
- name: Use permanent remote instance
94+
id: use-permanent-instance
95+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
96+
run: |
97+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
98+
8699
# This instance will be spawned especially for pull-request from forked repository
87100
- name: Start GitHub instance
88101
id: start-github-instance
@@ -115,6 +128,7 @@ jobs:
115128
token: ${{ env.CHECKOUT_TOKEN }}
116129

117130
- name: Setup Hyperstack dependencies
131+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
118132
uses: ./.github/actions/gpu_setup
119133
with:
120134
cuda-version: ${{ matrix.cuda }}
@@ -146,7 +160,7 @@ jobs:
146160

147161
teardown-instance:
148162
name: Teardown instance (cuda-h100-tests)
149-
if: ${{ always() && needs.setup-instance.result == 'success' }}
163+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
150164
needs: [ setup-instance, cuda-tests-linux ]
151165
runs-on: ubuntu-latest
152166
steps:

.github/workflows/gpu_unsigned_integer_h100_tests.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,17 @@ jobs:
6868
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
6969
runs-on: ubuntu-latest
7070
outputs:
71-
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
71+
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
72+
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
73+
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
74+
# otherwise we'll try to run the next job on a non-existing on-demand instance.
75+
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
76+
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
7277
steps:
7378
- name: Start remote instance
7479
id: start-remote-instance
7580
if: env.SECRETS_AVAILABLE == 'true'
81+
continue-on-error: true
7682
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
7783
with:
7884
mode: start
@@ -82,6 +88,13 @@ jobs:
8288
backend: hyperstack
8389
profile: single-h100
8490

91+
# Fall back on the permanent Hyperstack instance. Check the step outcome rather than failure(): the start step uses continue-on-error, so failure() never reports it.
92+
- name: Use permanent remote instance
93+
id: use-permanent-instance
94+
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
95+
run: |
96+
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
97+
8598
# This instance will be spawned especially for pull-request from forked repository
8699
- name: Start GitHub instance
87100
id: start-github-instance
@@ -114,6 +127,7 @@ jobs:
114127
token: ${{ env.CHECKOUT_TOKEN }}
115128

116129
- name: Setup Hyperstack dependencies
130+
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
117131
uses: ./.github/actions/gpu_setup
118132
with:
119133
cuda-version: ${{ matrix.cuda }}
@@ -145,7 +159,7 @@ jobs:
145159

146160
teardown-instance:
147161
name: Teardown instance (cuda-h100-tests)
148-
if: ${{ always() && needs.setup-instance.result == 'success' }}
162+
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
149163
needs: [ setup-instance, cuda-tests-linux ]
150164
runs-on: ubuntu-latest
151165
steps:

0 commit comments

Comments
 (0)