diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml
index b673da5adf..acdbe9cb06 100644
--- a/.github/workflows/integration_test_8gpu_models.yaml
+++ b/.github/workflows/integration_test_8gpu_models.yaml
@@ -3,6 +3,8 @@ name: 8 GPU Model Tests
 on:
   push:
     branches: [ main ]
+    tags:
+      - ciflow/8gpu/*  # also run when a tag matching this pattern is pushed
     paths-ignore:
       - 'torchtitan/experiments/**'
   pull_request:
@@ -21,18 +23,30 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
+permissions:
+  id-token: write  # allow jobs to request an OIDC token
+  contents: read  # read-only access to repository contents
+
 jobs:
+  # Step 1: Compute the job matrix by calling the reusable set-matrix workflow.
+  set-matrix:
+    uses: ./.github/workflows/set-matrix.yaml
+
+  # Step 2: Run build-test once per matrix combination produced by set-matrix.
   build-test:
+    needs: set-matrix
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: false  # let the remaining matrix entries finish when one fails
+      matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45  # NOTE(review): presumably minutes - confirm against linux_job_v2.yml inputs
       script: |
         set -eux
 
@@ -46,12 +60,14 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url "${{ matrix.index-url }}"
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url "${{ matrix.index-url }}"
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
+        sudo chown -R "$(id -u):$(id -g)" "$RUNNER_TEMP/artifacts-to-be-uploaded"  # make the current user the owner
 
-        mkdir artifacts-to-be-uploaded
-        python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
-        python -m tests.integration_tests.flux artifacts-to-be-uploaded/flux --ngpu 8
-        rm -rf artifacts-to-be-uploaded/*/checkpoint
-        rm -rf artifacts-to-be-uploaded/flux/*/inference_results/
+        python -m tests.integration_tests.run_tests --gpu_arch_type "${{ matrix.gpu-arch-type }}" --test_suite models "$RUNNER_TEMP/artifacts-to-be-uploaded" --ngpu 8
+        python -m tests.integration_tests.flux "$RUNNER_TEMP/artifacts-to-be-uploaded/flux" --ngpu 8
+        rm -rf "$RUNNER_TEMP/artifacts-to-be-uploaded"/*/checkpoint
+        rm -rf "$RUNNER_TEMP/artifacts-to-be-uploaded"/flux/*/inference_results/
diff --git a/.github/workflows/integration_test_8gpu_torchft.yaml b/.github/workflows/integration_test_8gpu_torchft.yaml
index 23f59d8bba..0931fa75b7 100644
--- a/.github/workflows/integration_test_8gpu_torchft.yaml
+++ b/.github/workflows/integration_test_8gpu_torchft.yaml
@@ -3,6 +3,8 @@ name: TorchFT 8 GPU Integration Test
 on:
   push:
     branches: [ main ]
+    tags:
+      - ciflow/8gpu/*  # also run when a tag matching this pattern is pushed
     paths:
       - 'torchtitan/components/ft.py'
       - '.github/workflows/integration_test_8gpu_torchft.yaml'
@@ -21,18 +23,30 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
+permissions:
+  id-token: write  # allow jobs to request an OIDC token
+  contents: read  # read-only access to repository contents
+
 jobs:
+  # Step 1: Compute the job matrix by calling the reusable set-matrix workflow.
+  set-matrix:
+    uses: ./.github/workflows/set-matrix.yaml
+
+  # Step 2: Run build-test once per matrix combination produced by set-matrix.
   build-test:
+    needs: set-matrix
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: false  # let the remaining matrix entries finish when one fails
+      matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45  # NOTE(review): presumably minutes - confirm against linux_job_v2.yml inputs
       script: |
         set -eux
 
@@ -47,14 +61,16 @@ jobs:
         pip config --user set global.progress_bar off
 
         python -m pip install torchft-nightly
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url "${{ matrix.index-url }}"
+        USE_CPP=0 python -m pip install --pre torchao --index-url "${{ matrix.index-url }}"
+
+        sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
+        sudo chown -R "$(id -u):$(id -g)" "$RUNNER_TEMP/artifacts-to-be-uploaded"  # make the current user the owner
 
-        mkdir artifacts-to-be-uploaded
         echo "torchft_lighthouse"
         RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 &
         echo "ft_integration_test"
         # Getting error - Cuda failure 217 'peer access is not supported between these two devices'
-        python -m tests.integration_tests.ft artifacts-to-be-uploaded --ngpu 8
+        python -m tests.integration_tests.ft "$RUNNER_TEMP/artifacts-to-be-uploaded" --ngpu 8
         # pkill -9 torchft_lighthouse
-        rm -rf artifacts-to-be-uploaded/*/checkpoint
+        rm -rf "$RUNNER_TEMP/artifacts-to-be-uploaded"/*/checkpoint
diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml
index 5564d8d70b..3d6704641c 100644
--- a/.github/workflows/set-matrix.yaml
+++ b/.github/workflows/set-matrix.yaml
@@ -27,9 +27,9 @@ jobs:
             "name": "rocm",
             "runner": "linux.rocm.gpu.gfx942.8",
             "gpu-arch-type": "rocm",
-            "gpu-arch-version": "7.0",
+            "gpu-arch-version": "7.1",
             "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12",
-            "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0"
+            "index-url": "https://download.pytorch.org/whl/nightly/rocm7.1"
           }'
 
           # Define CUDA matrix
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
index 8e16ecb4fb..3662aa6bf6 100755
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -121,7 +121,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             ],
             "Checkpoint Integration Test - save load model only checkpoint in HF definition and format",
             "model_only_hf_checkpoint",
-            skip_rocm_test=True,
         ),
         OverrideDefinitions(
             [