diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
new file mode 100644
index 000000000..66c3443c6
--- /dev/null
+++ b/.github/workflows/rocm-ci.yml
@@ -0,0 +1,348 @@
+# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+
+name: TransformerEngine CI
+
+on:
+  push:
+    branches:
+      - 'dev'
+      - 'release_v1.*_rocm'
+      - 'release_v2.*_rocm'
+  pull_request:
+    branches:
+      - 'dev'
+      - 'release_v1.**_rocm'
+      - 'release_v2.**_rocm'
+  workflow_dispatch:
+    inputs:
+      test_level:
+        description: 'Test Level (1-3)'
+        required: true
+        default: '3'
+      skip_dev_merge:
+        description: 'Skip merging dev branch'
+        type: boolean
+        default: false
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build_and_test:
+    name: Build and Test on GPU
+    timeout-minutes: 720
+    runs-on: linux-mi325-8
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: 'recursive'
+          fetch-depth: 0
+
+      - name: Merge origin/dev
+        # Only run on PRs targeting dev, or on manual dev runs where the merge was not skipped
+        if: |
+          (github.event_name == 'pull_request' && github.base_ref == 'dev') ||
+          (github.event_name == 'workflow_dispatch' && inputs.skip_dev_merge != true && github.ref == 'refs/heads/dev')
+        run: |
+          echo "Attempting to merge origin/dev..."
+          git config --global user.email "amd@amd.com"
+          git config --global user.name "AMD CI"
+
+          # Fetch dev specifically
+          git fetch origin dev
+
+          # Attempt the merge; a conflict exits with code 1 and fails the job
+          git merge origin/dev
+
+          # Update submodules after the merge to ensure new files are present
+          echo "Updating submodules after merge..."
+          git submodule update --init --recursive
+
+          echo "Merge successful."
+
+      - name: Select Docker Image Tag
+        id: select-image
+        env:
+          DEV_IMAGE: ${{ vars.DEV_DOCKER_IMAGE }}
+          REL_IMAGE: ${{ vars.REL613_DOCKER_IMAGE }}
+        run: |
+          BRANCH_NAME="${{ github.base_ref || github.ref_name }}"
+          echo "Determining image for branch: $BRANCH_NAME"
+          DEV_DOCKER_IMAGE="$DEV_IMAGE"
+          REL613_DOCKER_IMAGE="$REL_IMAGE"
+          IMAGE_TO_USE="$DEV_DOCKER_IMAGE"
+          if [[ $BRANCH_NAME =~ ^release_v([0-9]+)\.([0-9]+)_rocm$ ]]; then
+            MAJOR_VERSION=${BASH_REMATCH[1]}
+            MINOR_VERSION=${BASH_REMATCH[2]}
+            if (( MAJOR_VERSION == 1 )); then
+              if (( MINOR_VERSION == 13 || MINOR_VERSION == 14 )); then IMAGE_TO_USE="$REL613_DOCKER_IMAGE"; fi
+            fi
+          fi
+          echo "Selected image: $IMAGE_TO_USE"
+          echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT
+
+      - name: Pull Docker Image
+        run: |
+          docker pull ${{ steps.select-image.outputs.image-tag }}
+
+      - name: Run Container
+        run: |
+          docker run -dt \
+            --name te-runner \
+            --network=host \
+            --device=/dev/dri --device=/dev/kfd \
+            --shm-size=16G \
+            --pid=host \
+            --group-add $(getent group render | cut -d: -f3) \
+            --group-add $(getent group video | cut -d: -f3) \
+            -v "${{ github.workspace }}:/workspace" \
+            -w /workspace \
+            ${{ steps.select-image.outputs.image-tag }}
+
+      - name: Diagnostics
+        run: |
+          # On the runner
+          rocm-smi
+          # In the container
+          docker exec te-runner rocm-smi
+
+      - name: Determine GPU Architecture via rocminfo
+        id: gpu-arch
+        run: |
+          # Run rocminfo inside the container and capture the output
+          ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'")
+          if [ -z "$ARCH" ]; then
+            echo "::error::Could not determine GPU architecture using rocminfo inside the container."
+            # Optional: print the full rocminfo output for debugging
+            docker exec te-runner rocminfo
+            exit 1
+          fi
+          echo "Detected GPU Arch: $ARCH"
+          echo "arch=$ARCH" >> $GITHUB_OUTPUT
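+      # The detected gfx target feeds the build step below: it narrows PYTORCH_ROCM_ARCH and
+      # NVTE_ROCM_ARCH to the runner's architecture (presumably gfx942 on the linux-mi325-8
+      # runners), and NVTE_AITER_PREBUILT_BASE_URL points the build at prebuilt AITER artifacts
+      # on the internal Artifactory, presumably to avoid rebuilding them in CI.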
+      - name: Build Project
+        run: |
+          docker exec \
+            -e GPU_ARCH=${{ steps.gpu-arch.outputs.arch }} \
+            te-runner bash -c "$(cat <<'EOF'
+          set -ex
+
+          export HIP_PATH=""
+          export PYTORCH_ROCM_ARCH=$GPU_ARCH
+          export NVTE_ROCM_ARCH=$GPU_ARCH
+          export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
+          pip install ninja
+          pip install --no-build-isolation -v . 2>&1
+          EOF
+          )"
+
+      - name: Run sGPU tests
+        id: sgpu-tests
+        continue-on-error: true
+        run: |
+          # Clean up failure markers from previous runs, if any (a no-op on fresh k8s pods)
+          rm -f FAIL_*
+
+          docker exec \
+            -e TEST_SGPU=1 \
+            -e TEST_LEVEL=${{ inputs.test_level || '3' }} \
+            te-runner bash -c "$(cat <<'EOF'
+          #!/usr/bin/bash
+          set -x -o pipefail
+          ulimit -c 0  # Disable core dumps
+
+          # Debug output
+          ls -d /opt/rocm*
+          python --version
+          pip list | egrep "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"
+
+          HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 &
+          torch_pid=$!; echo "PyTorch test pid $torch_pid"
+
+          HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 &
+          jax_pid=$!; echo "JAX test pid $jax_pid"
+
+          HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 &
+          core_pid=$!; echo "Core test pid $core_pid"
+
+          wait $core_pid; core_rc=$?
+          wait $jax_pid; jax_rc=$?
+          wait $torch_pid; torch_rc=$?
+
+          # /workspace/FAIL_* files are failure markers the host runner can pick up and process later
+          # Check PyTorch
+          if [ $torch_rc -ne 0 ]; then
+            echo "::group::[FAILED] PyTorch sGPU Log"
+            cat /workspace/torch_sgpu.log
+            echo "::endgroup::"
+            echo "::error::PyTorch sGPU test FAILED."
+            touch /workspace/FAIL_TORCH_SGPU
+          fi
+
+          # Check JAX
+          if [ $jax_rc -ne 0 ]; then
+            echo "::group::[FAILED] JAX sGPU Log"
+            cat /workspace/jax_sgpu.log
+            echo "::endgroup::"
+            echo "::error::JAX sGPU test FAILED."
+            touch /workspace/FAIL_JAX_SGPU
+          fi
+
+          # Check Core
+          if [ $core_rc -ne 0 ]; then
+            echo "::group::[FAILED] Core sGPU Log"
+            cat /workspace/core_sgpu.log
+            echo "::endgroup::"
+            echo "::error::Core sGPU test FAILED."
+            touch /workspace/FAIL_CORE_SGPU
+          fi
+
+          test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $core_rc -eq 0
+          EOF
+          )"
+
+          # Export failed test statuses to the host runner
+          if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi
+          if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi
+          if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> $GITHUB_OUTPUT; fi
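+      # Unlike the sGPU suites above, which run concurrently with each framework pinned to its
+      # own GPU via HIP_VISIBLE_DEVICES, the mGPU suites below run sequentially with all GPUs
+      # visible to the container.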
+      - name: Run mGPU tests
+        id: mgpu-tests
+        continue-on-error: true
+        run: |
+          docker exec \
+            -e TEST_MGPU=1 \
+            -e TEST_LEVEL=${{ inputs.test_level || '3' }} \
+            te-runner bash -c "$(cat <<'EOF'
+          #!/usr/bin/bash
+          set -x -o pipefail
+          ulimit -c 0  # Disable core dumps
+
+          # Run PyTorch
+          ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1
+          torch_rc=$?
+
+          # Run JAX
+          ci/jax.sh > /workspace/jax_mgpu.log 2>&1
+          jax_rc=$?
+
+          # /workspace/FAIL_* files are failure markers the host runner can pick up and process later
+          if [ $torch_rc -ne 0 ]; then
+            echo "::group::[FAILED] PyTorch mGPU Log"
+            cat /workspace/torch_mgpu.log
+            echo "::endgroup::"
+            echo "::error::PyTorch mGPU test FAILED."
+            touch /workspace/FAIL_TORCH_MGPU
+          fi
+
+          if [ $jax_rc -ne 0 ]; then
+            echo "::group::[FAILED] JAX mGPU Log"
+            cat /workspace/jax_mgpu.log
+            echo "::endgroup::"
+            echo "::error::JAX mGPU test FAILED."
+            touch /workspace/FAIL_JAX_MGPU
+          fi
+
+          test $torch_rc -eq 0 -a $jax_rc -eq 0
+          EOF
+          )"
+
+          # Export failed test statuses to the host runner
+          if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi
+          if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi
+
+      - name: Run Examples
+        id: examples-tests
+        continue-on-error: true
+        run: |
+          docker exec te-runner bash -c "$(cat <<'EOF'
+          #!/usr/bin/bash
+          set -ex -o pipefail
+          ulimit -c 0  # Disable core dumps
+
+          cd /workspace/examples/pytorch/mnist
+          python main.py 2>&1 | tee /workspace/examples.log
+          python main.py --use-te 2>&1 | tee -a /workspace/examples.log
+          python main.py --use-fp8 2>&1 | tee -a /workspace/examples.log
+
+          cd /workspace/examples/jax/mnist
+          pip3 install -r requirements.txt
+          python test_single_gpu_mnist.py 2>&1 | tee -a /workspace/examples.log
+          python test_single_gpu_mnist.py --use-te 2>&1 | tee -a /workspace/examples.log
+          python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a /workspace/examples.log
+
+          cd /workspace/examples/jax/encoder
+          pip3 install -r requirements.txt
+          python test_single_gpu_encoder.py 2>&1 | tee -a /workspace/examples.log
+          python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a /workspace/examples.log
+          EOF
+          )"
+
+      - name: Check Test Failure Status
+        if: always()
+        run: |
+          EXIT_STATUS=0
+
+          # Check the results of the specific test steps. The sGPU/mGPU steps export per-framework
+          # failure markers via step outputs; the examples step is checked via its outcome, which
+          # is 'failure' even when continue-on-error was true.
+          # sGPU CHECKS
+          if [[ "${{ steps.sgpu-tests.outputs.core }}" == "fail" ]]; then
+            echo "::error::Core sGPU Tests Failed."
+            EXIT_STATUS=1
+          fi
+          if [[ "${{ steps.sgpu-tests.outputs.torch }}" == "fail" ]]; then
+            echo "::error::PyTorch sGPU Tests Failed."
+            EXIT_STATUS=1
+          fi
+          if [[ "${{ steps.sgpu-tests.outputs.jax }}" == "fail" ]]; then
+            echo "::error::JAX sGPU Tests Failed."
+            EXIT_STATUS=1
+          fi
+
+          # mGPU CHECKS
+          if [[ "${{ steps.mgpu-tests.outputs.torch }}" == "fail" ]]; then
+            echo "::error::PyTorch mGPU Tests Failed."
+            EXIT_STATUS=1
+          fi
+          if [[ "${{ steps.mgpu-tests.outputs.jax }}" == "fail" ]]; then
+            echo "::error::JAX mGPU Tests Failed."
+            EXIT_STATUS=1
+          fi
+
+          # EXAMPLES CHECK
+          if [[ "${{ steps.examples-tests.outcome }}" == "failure" ]]; then
+            echo "::error::Example Tests Failed."
+            EXIT_STATUS=1
+          fi
+
+          # Fail the job if any errors were detected
+          if [[ "$EXIT_STATUS" == "1" ]]; then
+            exit 1
+          fi
+
+      - name: Copy logs and reports from container
+        if: always()
+        run: |
+          docker cp te-runner:/workspace/torch_sgpu.log ./torch_sgpu.log || true
+          docker cp te-runner:/workspace/jax_sgpu.log ./jax_sgpu.log || true
+          docker cp te-runner:/workspace/core_sgpu.log ./core_sgpu.log || true
+          docker cp te-runner:/workspace/torch_mgpu.log ./torch_mgpu.log || true
+          docker cp te-runner:/workspace/jax_mgpu.log ./jax_mgpu.log || true
+
+      - name: Upload logs and test reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: logs-and-reports
+          path: |
+            *.log
+          if-no-files-found: ignore
+          retention-days: 5
+
+      - name: Cleanup container
+        if: always()
+        run: docker rm -f te-runner || true