chore(ci): fallback on permanent h100 instance on shortage

soonum · soonum · commit 3897137a3ffc · 2025-03-03T11:38:32.000+01:00
When a shortage occurs on n3-H100x1 instances on Hyperstack, we'll
fall back on the permanent one registered on GitHub.
This can be done by using 'h100x1' as runner label to run a job on
it.
diff --git a/.github/workflows/benchmark_gpu_core_crypto.yml b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -23,10 +23,16 @@ jobs:
     if: github.event_name != 'schedule' ||
       (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -36,6 +42,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fall back on the permanent instance running on Hyperstack. failure() is never true after a continue-on-error step, so check the step outcome.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-core-crypto-benchmarks:
     name: Execute GPU core crypto benchmarks
     needs: setup-instance
@@ -57,6 +70,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -128,7 +142,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/benchmark_gpu_erc20_common.yml b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -50,10 +50,16 @@ jobs:
     if:  github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -63,6 +69,13 @@ jobs:
           backend: ${{ inputs.backend }}
           profile: ${{ inputs.profile }}
 
+      # Fall back on the permanent instance running on Hyperstack. failure() is never true after a continue-on-error step, so check the step outcome.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-erc20-benchmarks:
     name: Cuda ERC20 benchmarks (${{ inputs.profile }})
     needs: setup-instance
@@ -84,6 +97,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -154,7 +168,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/benchmark_gpu_integer_common.yml b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -114,10 +114,16 @@ jobs:
     needs: prepare-matrix
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -127,6 +133,13 @@ jobs:
           backend: ${{ inputs.backend }}
           profile: ${{ inputs.profile }}
 
+      # Fall back on the permanent instance running on Hyperstack. failure() is never true after a continue-on-error step, so check the step outcome.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-benchmarks:
     name: Cuda benchmarks (${{ inputs.profile }})
     needs: [ prepare-matrix, setup-instance ]
@@ -154,6 +167,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -230,7 +244,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-benchmarks, slack-notify ]
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/gpu_fast_h100_tests.yml b/.github/workflows/gpu_fast_h100_tests.yml
@@ -68,11 +68,17 @@ jobs:
       (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
       - name: Start remote instance
         id: start-remote-instance
         if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -82,6 +88,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fall back on the permanent instance running on Hyperstack when secrets are available and the start-remote-instance step failed.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
       # This instance will be spawned especially for pull-request from forked repository
       - name: Start GitHub instance
         id: start-github-instance
@@ -114,6 +127,7 @@ jobs:
           token: ${{ env.CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -159,7 +173,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/gpu_full_h100_tests.yml b/.github/workflows/gpu_full_h100_tests.yml
@@ -20,10 +20,16 @@ jobs:
     name: Setup instance (cuda-h100-tests)
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -33,6 +39,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fall back on the permanent instance running on Hyperstack. failure() is never true after a continue-on-error step, so check the step outcome.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-tests-linux:
     name: CUDA H100 tests
     needs: [ setup-instance ]
@@ -68,6 +81,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -109,6 +123,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/gpu_signed_integer_h100_tests.yml b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -69,11 +69,17 @@ jobs:
       (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
       - name: Start remote instance
         id: start-remote-instance
         if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -83,6 +89,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fall back on the permanent instance running on Hyperstack. failure() is never true after a continue-on-error step, so check the step outcome.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
       # This instance will be spawned especially for pull-request from forked repository
       - name: Start GitHub instance
         id: start-github-instance
@@ -115,6 +128,7 @@ jobs:
           token: ${{ env.CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -146,7 +160,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/gpu_unsigned_integer_h100_tests.yml b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -68,11 +68,17 @@ jobs:
       (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Prefer the permanent remote instance label: the on-demand instance label output is set before the start-remote-instance step ends.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set when the failure happens, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
       - name: Start remote instance
         id: start-remote-instance
         if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -82,6 +88,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fall back on the permanent instance running on Hyperstack. failure() is never true after a continue-on-error step, so check the step outcome.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
       # This instance will be spawned especially for pull-request from forked repository
       - name: Start GitHub instance
         id: start-github-instance
@@ -114,6 +127,7 @@ jobs:
           token: ${{ env.CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -145,7 +159,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps: