pytorch · akashveramd · Sep 8, 2025 · Oct 3, 2025
diff --git a/.github/workflows/linux_job_v2_rocm.yml b/.github/workflows/linux_job_v2_rocm.yml
@@ -0,0 +1,390 @@
+name: Build / Test on Linux v2
+
+on:
+  workflow_call:
+    inputs:
+      script:
+        description: 'Script to utilize'
+        default: "python setup.py bdist_wheel"
+        type: string
+      timeout:
+        description: 'Timeout for the job (in minutes)'
+        default: 30
+        type: number
+      runner:
+        description: 'Runner type to utilize'
+        default: "linux.2xlarge"
+        type: string
+      upload-artifact:
+        description: |
+          Name to give artifacts uploaded from ${RUNNER_ARTIFACT_DIR}, all the wheel files
+          under dist/ and any files under artifacts-to-be-uploaded/ will be uploaded
+        default: ''
+        type: string
+      upload-artifact-to-s3:
+        description: |
+          Upload the artifact to S3 instead of GitHub. This is used for large artifacts like
+          exported model
+        required: false
+        default: false
+        type: boolean
+      download-artifact:
+        description: 'Name to download artifacts to ${RUNNER_ARTIFACT_DIR}'
+        default: ''
+        type: string
+      repository:
+        description: 'Repository to checkout, defaults to ""'
+        default: ""
+        type: string
+      fetch-depth:
+        description: 'Number of commits to fetch, defaults to 1 similar to actions/checkout'
+        default: 1
+        type: number
+      submodules:
+        description:
+          Same as actions/checkout, set to `true` to checkout submodules or `recursive` to
+          recursively checkout everything
+        default: ""
+        type: string
+      ref:
+        description: 'Reference to checkout, defaults to "nightly"'
+        default: ""
+        type: string
+      test-infra-repository:
+        description: "Test infra repository to use"
+        default: "pytorch/test-infra"
+        type: string
+      test-infra-ref:
+        description: "Test infra reference to use"
+        default: ""
+        type: string
+      use-custom-docker-registry:
+        description: "Use the custom ECR registry. Applies only if build.sh exists in docker-build-dir."
+        default: true
+        type: boolean
+      docker-image:
+        description: Identifies the Docker image by name.
+        default: "pytorch/almalinux-builder"
+        type: string
+      docker-build-dir:
+        description: |
+          The directory containing the build.sh shell script to build the docker image.
+          The script parameters can be passed to docker build similar to how it is used
+          in PyTorch, i.e. build.sh "${IMAGE_NAME}" -t "${DOCKER_IMAGE}".
+        default: ".ci/docker"
+        type: string
+      gpu-arch-type:
+        description: "GPU arch type to use"
+        default: "cpu"
+        type: string
+      gpu-arch-version:
+        description: "GPU arch version to use"
+        default: ""
+        type: string
+      job-name:
+        description: "Name for the job, which is displayed in the GitHub UI"
+        default: "linux-job"
+        type: string
+      continue-on-error:
+        description: "Prevents a job from failing when a step fails. Set to true to allow a job to pass when exec script step fails."
+        default: false
+        type: boolean
+      binary-matrix:
+        description: "If we are calling this workflow with binary build matrix entry, will initialize matrix entries and env vars"
+        required: false
+        default: ''
+        type: string
+      run-with-docker:
+        description: "Whether the provided script should run inside a docker container"
+        required: false
+        default: true
+        type: boolean
+      secrets-env:
+        description: "List of secrets to be exported to environment variables"
+        type: string
+        default: ''
+      no-sudo:
+        description: If set to any value, don't use sudo to clean the workspace
+        required: false
+        default: false
+        type: boolean
+
+jobs:
+  job:
+    strategy:
+      fail-fast: false
+    name: ${{ inputs.job-name }}
+    env:
+      DOCKER_IMAGE: >-
+        ${{ inputs.docker-image == 'pytorch/almalinux-builder' && format('pytorch/almalinux-builder:{0}{1}',
+                                                                      inputs.gpu-arch-type,
+                                                                      inputs.gpu-arch-version)
+                                                            || inputs.docker-image }}
+      REPOSITORY: ${{ inputs.repository || github.repository }}
+      # Will be blank outside of this
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+      SCRIPT: ${{ inputs.script }}
+    runs-on: ${{ inputs.runner }}
+    # TODO: Eventually this should run in a container, we need to make a container that matches up
+    #       with the users for our self hosted runner infra since using actions/checkout with a root
+    #       user in a container will make it so that the directories will need to be chowned to the
+    #       ec2-user prior to a checkout being able to be run by ec2-user
+    timeout-minutes: ${{ inputs.timeout }}
+    steps:
+      - name: Clean workspace
+        env:
+          NO_SUDO: ${{ inputs.no-sudo }}
+        run: |
+          set -euxo pipefail
+          if [[ "${NO_SUDO}" == "false" ]]; then
+            echo "::group::Cleanup with-sudo debug output"
+            sudo rm -rfv "${GITHUB_WORKSPACE}"
+          else
+            echo "::group::Cleanup no-sudo debug output"
+            rm -rfv "${GITHUB_WORKSPACE}"
+          fi
+
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (${{ inputs.test-infra-repository }}@${{ inputs.test-infra-ref }})
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ inputs.test-infra-repository }}
+          ref: ${{ inputs.test-infra-ref }}
+          path: test-infra
+
+          # PyTorch, the primary target for this job template, heavily
+          # relies on submodules. Clone them by default to avoid
+          # surprises.
+          submodules: 'recursive'
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
+        if: ${{ inputs.gpu-arch-type != 'rocm' }}
+
+      - name: Setup ROCM
+        uses: pytorch/pytorch/.github/actions/setup-rocm@main
+        if: ${{ inputs.gpu-arch-type == 'rocm' }}
+
+      - name: Setup SSH
+        uses: ./test-infra/.github/actions/setup-ssh
+        with:
+          github-secret: ${{ github.token }}
+          instructions: |
+            All testing is done inside the container, to start an interactive session run:
+               docker exec -it $(docker container ps --format '{{.ID}}') bash
+
+      - name: Checkout repository (${{ inputs.repository || github.repository }}@${{ inputs.ref }})
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || github.ref }}
+          path: ${{ inputs.repository || github.repository }}
+          fetch-depth: ${{ inputs.fetch-depth }}
+          submodules: ${{ inputs.submodules }}
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: true
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: ./test-infra/.github/actions/calculate-docker-image
+        with:
+          use-custom-docker-registry: ${{ inputs.use-custom-docker-registry }}
+          docker-image-name: ${{ env.DOCKER_IMAGE }}
+          docker-build-dir: ${{ inputs.docker-build-dir }}
+          # This needs to be where the repository is checked out
+          working-directory: ${{ inputs.repository || github.repository }}
+
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Check if in a container runner
+        shell: bash
+        id: check_container_runner
+        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        id: setup-sscache-port-flag
+        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        id: install-nvidia-driver
+        uses: ./test-infra/.github/actions/setup-nvidia
+        if: ${{ inputs.gpu-arch-type == 'cuda' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER== 'false' }}
+
+      - name: Download artifacts (if any)
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        if: ${{ inputs.download-artifact != '' }}
+        with:
+          name: ${{ inputs.download-artifact }}
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Export matrix variables (if any)
+        uses: ./test-infra/.github/actions/export-matrix-variables
+        if: ${{ inputs.binary-matrix != '' }}
+        with:
+          binary-matrix: ${{ inputs.binary-matrix }}
+          target-os: "linux"
+
+      - name: Run script in container
+        if: ${{ inputs.run-with-docker == true }}
+        continue-on-error: ${{ inputs.continue-on-error }}
+        working-directory: ${{ inputs.repository || github.repository }}
+        env:
+          ALL_SECRETS: ${{ toJSON(secrets) }}
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+        run: |
+          set -ex
+          {
+            echo "#!/usr/bin/env bash";
+            echo "set -eou pipefail";
+            # shellcheck disable=SC2016
+            echo 'eval "$(conda shell.bash hook)"';
+            echo "set -x";
+            echo "${SCRIPT}";
+          } > "${RUNNER_TEMP}/exec_script"
+          chmod +x "${RUNNER_TEMP}/exec_script"
+          python3 "${{ github.workspace }}/test-infra/.github/scripts/run_with_env_secrets.py" "${{ inputs.secrets-env }}"
+
+      - name: Run script outside container
+        if: ${{ inputs.run-with-docker == false }}
+        continue-on-error: ${{ inputs.continue-on-error }}
+        working-directory: ${{ inputs.repository || github.repository }}
+        env:
+          ALL_SECRETS: ${{ toJSON(secrets) }}
+        run: |
+          {
+            echo "#!/usr/bin/env bash";
+            echo "set -eou pipefail";
+            # Source conda so it's available to the script environment
+            echo "${SCRIPT}";
+          } > "${RUNNER_TEMP}/exec_script"
+          # The GITHUB_WORKFLOW env var contains the name of the workflow
+          # defined at the top of the workflow file. Unfortunately this is not
+          # enclosed in quotes in the env file, so simply eval-ing each line in
+          # the file will fail. As a workaround, we eval all env vars except
+          # for GITHUB_WORKFLOW here.
+          while read -r line; do
+            if [[ "${line}" != "GITHUB_WORKFLOW="* ]]; then
+              eval "export ${line}"
+            fi
+          done < "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
+          bash "${RUNNER_TEMP}/exec_script"
+
+      - name: Surface failing tests
+        if: always()
+        uses: pmeier/pytest-results-action@a2c1430e2bddadbad9f49a6f9b879f062c6b19b1 # v0.3.0
+        with:
+          path: ${{ env.RUNNER_TEST_RESULTS_DIR }}
+          fail-on-empty: false
+
+      - name: Chown repository directory
+        if: ${{ inputs.gpu-arch-type != 'rocm' }}
+        uses: ./test-infra/.github/actions/chown-directory
+        with:
+          directory: ${{ github.workspace }}/${{ env.repository }}
+          ALPINE_IMAGE: ${{ startsWith(inputs.runner, 'linux.arm64') && 'arm64v8/alpine' || '308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine' }}
+
+      - name: Chown runner temp
+        if: ${{ inputs.gpu-arch-type != 'rocm' }}
+        uses: ./test-infra/.github/actions/chown-directory
+        with:
+          directory: ${{ runner.temp }}
+          ALPINE_IMAGE: ${{ startsWith(inputs.runner, 'linux.arm64') && 'arm64v8/alpine' || '308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine' }}
+
+      - name: Prepare artifacts for upload
+        if: always()
+        working-directory: ${{ inputs.repository || github.repository }}
+        id: check-artifacts
+        env:
+          UPLOAD_ARTIFACT_NAME: ${{ inputs.upload-artifact }}
+        run: |
+          # Only do these steps if we actually want to upload an artifact
+          if [[ -n "${UPLOAD_ARTIFACT_NAME}" ]]; then
+            # If the default execution path is followed then we should get a wheel in the dist/ folder
+            # attempt to just grab whatever is in there and scoop it all up
+            if find "dist/" -name "*.whl" >/dev/null 2>/dev/null; then
+              mv -v dist/*.whl "${RUNNER_ARTIFACT_DIR}/"
+            fi
+            if [[ -d "artifacts-to-be-uploaded" ]]; then
+              mv -v artifacts-to-be-uploaded/* "${RUNNER_ARTIFACT_DIR}/"
+            fi
+          fi
+
+          upload_docs=0
+          # Check if there are files in the documentation folder to upload, note that
+          # empty folders do not count
+          if find "${RUNNER_DOCS_DIR}" -mindepth 1 -maxdepth 1 -type f | read -r; then
+            # TODO: Add a check here to test if on ec2 because if we're not on ec2 then this
+            # upload will probably not work correctly
+            upload_docs=1
+          fi
+          echo "upload-docs=${upload_docs}" >> "${GITHUB_OUTPUT}"
+
+      # NB: Keep this for compatibility with existing jobs and also keep in mind that only
+      # our AWS runners have access to S3
+      - name: Upload artifacts to GitHub (if any)
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        if: ${{ always() && inputs.upload-artifact != '' && !inputs.upload-artifact-to-s3 }}
+        with:
+          name: ${{ inputs.upload-artifact }}
+          path: ${{ runner.temp }}/artifacts/
+
+      # NB: This only works with our AWS runners
+      - name: Upload artifacts to S3 (if any)
+        uses: ./test-infra/.github/actions/upload-artifact-s3
+        if: ${{ always() && inputs.upload-artifact != '' && inputs.upload-artifact-to-s3 }}
+        with:
+          retention-days: 14
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ env.REPOSITORY }}/${{ github.run_id }}/artifacts
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Upload documentation to S3 (if any)
+        uses: ./test-infra/.github/actions/upload-artifact-s3
+        if: ${{ steps.check-artifacts.outputs.upload-docs == 1 && github.event.pull_request.number != '' }}
+        with:
+          retention-days: 14
+          s3-bucket: doc-previews
+          if-no-files-found: error
+          path: ${{ env.RUNNER_DOCS_DIR }}
+          # ${{ env.repository }} is $OWNER/$REPO
+          s3-prefix: ${{ env.REPOSITORY }}/${{ github.event.pull_request.number }}
+          # Overwrite doc previews for the same PR
+          overwrite: true
+
+      - name: Teardown Linux
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
+        uses: ./test-infra/.github/actions/teardown-linux
+
+      - name: Clean workspace after tear down
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
+        env:
+          NO_SUDO: ${{ inputs.no-sudo }}
+          REPOSITORY: ${{ inputs.repository || github.repository }}
+        run: |
+          set +e
+          if [[ "${NO_SUDO}" == "false" ]]; then
+            sudo rm -rf "${GITHUB_WORKSPACE:?}/${REPOSITORY:?}"
+          else
+            rm -rf "${GITHUB_WORKSPACE:?}/${REPOSITORY:?}"
+          fi
+          set -e