@@ -114,10 +114,16 @@ jobs:
114114 needs : prepare-matrix
115115 runs-on : ubuntu-latest
116116 outputs :
117- runner-name : ${{ steps.start-instance.outputs.label }}
117+ # Check the permanent remote instance label first: the on-demand remote instance label output is set before the end of the start-remote-instance step.
118+ # If that step fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
119+ # Since the on-demand remote label is already set when the failure happens, the logical OR must be written in this order,
120+ # otherwise we would try to run the next job on a non-existent on-demand instance.
121+ runner-name : ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
122+ remote-instance-outcome : ${{ steps.start-remote-instance.outcome }}
118123 steps :
119- - name : Start instance
120- id : start-instance
124+ - name : Start remote instance
125+ id : start-remote-instance
126+ continue-on-error : true
121127 uses : zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
122128 with :
123129 mode : start
@@ -127,6 +133,13 @@ jobs:
127133 backend : ${{ inputs.backend }}
128134 profile : ${{ inputs.profile }}
129135
136+ # Fall back on permanent instances running on Hyperstack. The start step has continue-on-error, so failure() stays false: test its outcome instead.
137+ - name : Use permanent remote instance
138+ id : use-permanent-instance
139+ if : ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
140+ run : |
141+ echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
142+
130143 cuda-benchmarks :
131144 name : Cuda benchmarks (${{ inputs.profile }})
132145 needs : [ prepare-matrix, setup-instance ]
@@ -154,6 +167,7 @@ jobs:
154167 token : ${{ secrets.REPO_CHECKOUT_TOKEN }}
155168
156169 - name : Setup Hyperstack dependencies
170+ if : needs.setup-instance.outputs.remote-instance-outcome == 'success'
157171 uses : ./.github/actions/gpu_setup
158172 with :
159173 cuda-version : ${{ matrix.cuda }}
@@ -230,7 +244,7 @@ jobs:
230244
231245 teardown-instance :
232246 name : Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
233- if : ${{ always() && needs.setup-instance.result == 'success' }}
247+ if : ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
234248 needs : [ setup-instance, cuda-benchmarks, slack-notify ]
235249 runs-on : ubuntu-latest
236250 steps :
0 commit comments