From 415bc42288a9ddabdd270062ba89d3c096a0b6d7 Mon Sep 17 00:00:00 2001
From: Akash Verma
Date: Mon, 8 Sep 2025 00:02:22 -0500
Subject: [PATCH 1/2] Added aws-actions authentication to linux_job_v2.yml.

---
 .github/workflows/linux_job_v2.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/linux_job_v2.yml b/.github/workflows/linux_job_v2.yml
index 595af0cffa..9ecf6a2867 100644
--- a/.github/workflows/linux_job_v2.yml
+++ b/.github/workflows/linux_job_v2.yml
@@ -186,6 +186,19 @@ jobs:
           fetch-depth: ${{ inputs.fetch-depth }}
           submodules: ${{ inputs.submodules }}
 
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: true
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
       - name: Calculate docker image
         id: calculate-docker-image
         uses: ./test-infra/.github/actions/calculate-docker-image

From f7ee220ad18ccb7e193888ed93a727a92e6fbb70 Mon Sep 17 00:00:00 2001
From: Akash Verma
Date: Fri, 3 Oct 2025 17:36:23 -0500
Subject: [PATCH 2/2] Created a new reusable workflow linux_job_v2_rocm.yml for
 ROCm with AWS authentication. Rolled back changes made to linux_job_v2.yml.

---
 .github/workflows/linux_job_v2.yml      |  13 -
 .github/workflows/linux_job_v2_rocm.yml | 390 ++++++++++++++++++++++++
 2 files changed, 390 insertions(+), 13 deletions(-)
 create mode 100644 .github/workflows/linux_job_v2_rocm.yml

diff --git a/.github/workflows/linux_job_v2.yml b/.github/workflows/linux_job_v2.yml
index 9ecf6a2867..595af0cffa 100644
--- a/.github/workflows/linux_job_v2.yml
+++ b/.github/workflows/linux_job_v2.yml
@@ -186,19 +186,6 @@ jobs:
           fetch-depth: ${{ inputs.fetch-depth }}
           submodules: ${{ inputs.submodules }}
 
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: true
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
-
       - name: Calculate docker image
         id: calculate-docker-image
         uses: ./test-infra/.github/actions/calculate-docker-image
diff --git a/.github/workflows/linux_job_v2_rocm.yml b/.github/workflows/linux_job_v2_rocm.yml
new file mode 100644
index 0000000000..9ecf6a2867
--- /dev/null
+++ b/.github/workflows/linux_job_v2_rocm.yml
@@ -0,0 +1,390 @@
+name: Build / Test on Linux v2
+
+on:
+  workflow_call:
+    inputs:
+      script:
+        description: 'Script to utilize'
+        default: "python setup.py bdist_wheel"
+        type: string
+      timeout:
+        description: 'Timeout for the job (in minutes)'
+        default: 30
+        type: number
+      runner:
+        description: 'Runner type to utilize'
+        default: "linux.2xlarge"
+        type: string
+      upload-artifact:
+        description: |
+          Name to give artifacts uploaded from ${RUNNER_ARTIFACT_DIR}, all the wheel files
+          under dist/ and any files under artifacts-to-be-uploaded/ will be uploaded
+        default: ''
+        type: string
+      upload-artifact-to-s3:
+        description: |
+          Upload the artifact to S3 instead of GitHub.
+          This is used for large artifacts like
+          exported model
+        required: false
+        default: false
+        type: boolean
+      download-artifact:
+        description: 'Name to download artifacts to ${RUNNER_ARTIFACT_DIR}'
+        default: ''
+        type: string
+      repository:
+        description: 'Repository to checkout, defaults to ""'
+        default: ""
+        type: string
+      fetch-depth:
+        description: 'Number of commits to fetch, defaults to 1 similar to actions/checkout'
+        default: 1
+        type: number
+      submodules:
+        description:
+          Same as actions/checkout, set to `true` to checkout submodules or `recursive` to
+          recursively checkout everything
+        default: ""
+        type: string
+      ref:
+        description: 'Reference to checkout, defaults to "nightly"'
+        default: ""
+        type: string
+      test-infra-repository:
+        description: "Test infra repository to use"
+        default: "pytorch/test-infra"
+        type: string
+      test-infra-ref:
+        description: "Test infra reference to use"
+        default: ""
+        type: string
+      use-custom-docker-registry:
+        description: "Use the custom ECR registry. Applies only if build.sh exists in docker-build-dir."
+        default: true
+        type: boolean
+      docker-image:
+        description: Identifies the Docker image by name.
+        default: "pytorch/almalinux-builder"
+        type: string
+      docker-build-dir:
+        description: |
+          The directory containing the build.sh shell script to build the docker image.
+          The script parameters can be passed to docker build similar to how it is used
+          in PyTorch, i.e. build.sh "${IMAGE_NAME}" -t "${DOCKER_IMAGE}".
+        default: ".ci/docker"
+        type: string
+      gpu-arch-type:
+        description: "GPU arch type to use"
+        default: "cpu"
+        type: string
+      gpu-arch-version:
+        description: "GPU arch version to use"
+        default: ""
+        type: string
+      job-name:
+        description: "Name for the job, which is displayed in the GitHub UI"
+        default: "linux-job"
+        type: string
+      continue-on-error:
+        description: "Prevents a job from failing when a step fails. Set to true to allow a job to pass when exec script step fails."
+        default: false
+        type: boolean
+      binary-matrix:
+        description: "If we are calling this workflow with binary build matrix entry, will initialize matrix entries and env vars"
+        required: false
+        default: ''
+        type: string
+      run-with-docker:
+        description: "Whether the provided script should run inside a docker container"
+        required: false
+        default: true
+        type: boolean
+      secrets-env:
+        description: "List of secrets to be exported to environment variables"
+        type: string
+        default: ''
+      no-sudo:
+        description: If set to any value, don't use sudo to clean the workspace
+        required: false
+        default: false
+        type: boolean
+
+jobs:
+  job:
+    strategy:
+      fail-fast: false
+    name: ${{ inputs.job-name }}
+    env:
+      DOCKER_IMAGE: >-
+        ${{ inputs.docker-image == 'pytorch/almalinux-builder' && format('pytorch/almalinux-builder:{0}{1}',
+        inputs.gpu-arch-type,
+        inputs.gpu-arch-version)
+        || inputs.docker-image }}
+      REPOSITORY: ${{ inputs.repository || github.repository }}
+      # Will be blank outside of this
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+      SCRIPT: ${{ inputs.script }}
+    runs-on: ${{ inputs.runner }}
+    # TODO: Eventually this should run in a container, we need to make a container that matches up
+    # with the users for our self hosted runner infra since using actions/checkout with a root
+    # user in a container will make it so that the directories will need to be chowned to the
+    # ec2-user prior to a checkout being able to be run by ec2-user
+    timeout-minutes: ${{ inputs.timeout }}
+    steps:
+      - name: Clean workspace
+        env:
+          NO_SUDO: ${{ inputs.no-sudo }}
+        run: |
+          set -euxo pipefail
+          if [[ "${NO_SUDO}" == "false" ]]; then
+            echo "::group::Cleanup with-sudo debug output"
+            sudo rm -rfv "${GITHUB_WORKSPACE}"
+          else
+            echo "::group::Cleanup no-sudo debug output"
+            rm -rfv "${GITHUB_WORKSPACE}"
+          fi
+
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (${{ inputs.test-infra-repository }}@${{ inputs.test-infra-ref }})
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ inputs.test-infra-repository }}
+          ref: ${{ inputs.test-infra-ref }}
+          path: test-infra
+
+          # PyTorch, the primary target for this job template, heavily
+          # relies on submodules. Clone them by default to avoid
+          # surprises.
+          submodules: 'recursive'
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
+        if: ${{ inputs.gpu-arch-type != 'rocm' }}
+
+      - name: Setup ROCM
+        uses: pytorch/pytorch/.github/actions/setup-rocm@main
+        if: ${{ inputs.gpu-arch-type == 'rocm' }}
+
+      - name: Setup SSH
+        uses: ./test-infra/.github/actions/setup-ssh
+        with:
+          github-secret: ${{ github.token }}
+          instructions: |
+            All testing is done inside the container, to start an interactive session run:
+              docker exec -it $(docker container ps --format '{{.ID}}') bash
+
+      - name: Checkout repository (${{ inputs.repository || github.repository }}@${{ inputs.ref }})
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || github.ref }}
+          path: ${{ inputs.repository || github.repository }}
+          fetch-depth: ${{ inputs.fetch-depth }}
+          submodules: ${{ inputs.submodules }}
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: true
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: ./test-infra/.github/actions/calculate-docker-image
+        with:
+          use-custom-docker-registry: ${{ inputs.use-custom-docker-registry }}
+          docker-image-name: ${{ env.DOCKER_IMAGE }}
+          docker-build-dir: ${{ inputs.docker-build-dir }}
+          # This needs to be where the repository is checked out
+          working-directory: ${{ inputs.repository || github.repository }}
+
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Check if in a container runner
+        shell: bash
+        id: check_container_runner
+        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        id: setup-sscache-port-flag
+        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        id: install-nvidia-driver
+        uses: ./test-infra/.github/actions/setup-nvidia
+        if: ${{ inputs.gpu-arch-type == 'cuda' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
+
+      - name: Download artifacts (if any)
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        if: ${{ inputs.download-artifact != '' }}
+        with:
+          name: ${{ inputs.download-artifact }}
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Export matrix variables (if any)
+        uses: ./test-infra/.github/actions/export-matrix-variables
+        if: ${{ inputs.binary-matrix != '' }}
+        with:
+          binary-matrix: ${{ inputs.binary-matrix }}
+          target-os: "linux"
+
+      - name: Run script in container
+        if: ${{ inputs.run-with-docker == true }}
+        continue-on-error: ${{ inputs.continue-on-error }}
+        working-directory: ${{ inputs.repository || github.repository }}
+        env:
+          ALL_SECRETS: ${{ toJSON(secrets) }}
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+        run: |
+          set -ex
+          {
+            echo "#!/usr/bin/env bash";
+            echo "set -eou pipefail";
+            # shellcheck disable=SC2016
+            echo 'eval "$(conda shell.bash hook)"';
+            echo "set -x";
+            echo "${SCRIPT}";
+          } > "${RUNNER_TEMP}/exec_script"
+          chmod +x "${RUNNER_TEMP}/exec_script"
+          python3 "${{ github.workspace }}/test-infra/.github/scripts/run_with_env_secrets.py" "${{ inputs.secrets-env }}"
+
+      - name: Run script outside container
+        if: ${{ inputs.run-with-docker == false }}
+        continue-on-error: ${{ inputs.continue-on-error }}
+        working-directory: ${{ inputs.repository || github.repository }}
+        env:
+          ALL_SECRETS: ${{ toJSON(secrets) }}
+        run: |
+          {
+            echo "#!/usr/bin/env bash";
+            echo "set -eou pipefail";
+            # Source conda so it's available to the script environment
+            echo "${SCRIPT}";
+          } > "${RUNNER_TEMP}/exec_script"
+          # The GITHUB_WORKFLOW env var contains the name of the workflow
+          # defined at the top of the workflow file. Unfortunately this is not
+          # enclosed in quotes in the env file, so simply eval-ing each line in
+          # the file will fail. As a workaround, we eval all env vars except
+          # for GITHUB_WORKFLOW here.
+          while read -r line; do
+            if [[ "${line}" != "GITHUB_WORKFLOW="* ]]; then
+              eval "export ${line}"
+            fi
+          done < "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
+          bash "${RUNNER_TEMP}/exec_script"
+
+      - name: Surface failing tests
+        if: always()
+        uses: pmeier/pytest-results-action@a2c1430e2bddadbad9f49a6f9b879f062c6b19b1 # v0.3.0
+        with:
+          path: ${{ env.RUNNER_TEST_RESULTS_DIR }}
+          fail-on-empty: false
+
+      - name: Chown repository directory
+        if: ${{ inputs.gpu-arch-type != 'rocm' }}
+        uses: ./test-infra/.github/actions/chown-directory
+        with:
+          directory: ${{ github.workspace }}/${{ env.repository }}
+          ALPINE_IMAGE: ${{ startsWith(inputs.runner, 'linux.arm64') && 'arm64v8/alpine' || '308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine' }}
+
+      - name: Chown runner temp
+        if: ${{ inputs.gpu-arch-type != 'rocm' }}
+        uses: ./test-infra/.github/actions/chown-directory
+        with:
+          directory: ${{ runner.temp }}
+          ALPINE_IMAGE: ${{ startsWith(inputs.runner, 'linux.arm64') && 'arm64v8/alpine' || '308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine' }}
+
+      - name: Prepare artifacts for upload
+        if: always()
+        working-directory: ${{ inputs.repository || github.repository }}
+        id: check-artifacts
+        env:
+          UPLOAD_ARTIFACT_NAME: ${{ inputs.upload-artifact }}
+        run: |
+          # Only do these steps if we actually want to upload an artifact
+          if [[ -n "${UPLOAD_ARTIFACT_NAME}" ]]; then
+            # If the default execution path is followed then we should get a wheel in the dist/ folder
+            # attempt to just grab whatever is in there and scoop it all up
+            if find "dist/" -name "*.whl" >/dev/null 2>/dev/null; then
+              mv -v dist/*.whl "${RUNNER_ARTIFACT_DIR}/"
+            fi
+            if [[ -d "artifacts-to-be-uploaded" ]]; then
+              mv -v artifacts-to-be-uploaded/* "${RUNNER_ARTIFACT_DIR}/"
+            fi
+          fi
+
+          upload_docs=0
+          # Check if there are files in the documentation folder to upload, note that
+          # empty folders do not count
+          if find "${RUNNER_DOCS_DIR}" -mindepth 1 -maxdepth 1 -type f | read -r; then
+            # TODO: Add a check here to test if on ec2 because if we're not on ec2 then this
+            # upload will probably not work correctly
+            upload_docs=1
+          fi
+          echo "upload-docs=${upload_docs}" >> "${GITHUB_OUTPUT}"
+
+      # NB: Keep this for compatibility with existing jobs and also keep in mind that only
+      # our AWS runners have access to S3
+      - name: Upload artifacts to GitHub (if any)
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        if: ${{ always() && inputs.upload-artifact != '' && !inputs.upload-artifact-to-s3 }}
+        with:
+          name: ${{ inputs.upload-artifact }}
+          path: ${{ runner.temp }}/artifacts/
+
+      # NB: This only works with our AWS runners
+      - name: Upload artifacts to S3 (if any)
+        uses: ./test-infra/.github/actions/upload-artifact-s3
+        if: ${{ always() && inputs.upload-artifact != '' && inputs.upload-artifact-to-s3 }}
+        with:
+          retention-days: 14
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ env.REPOSITORY }}/${{ github.run_id }}/artifacts
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Upload documentation to S3 (if any)
+        uses: ./test-infra/.github/actions/upload-artifact-s3
+        if: ${{ steps.check-artifacts.outputs.upload-docs == 1 && github.event.pull_request.number != '' }}
+        with:
+          retention-days: 14
+          s3-bucket: doc-previews
+          if-no-files-found: error
+          path: ${{ env.RUNNER_DOCS_DIR }}
+          # ${{ env.repository }} is $OWNER/$REPO
+          s3-prefix: ${{ env.REPOSITORY }}/${{ github.event.pull_request.number }}
+          # Overwrite doc previews for the same PR
+          overwrite: true
+
+      - name: Teardown Linux
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
+        uses: ./test-infra/.github/actions/teardown-linux
+
+      - name: Clean workspace after tear down
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
+        env:
+          NO_SUDO: ${{ inputs.no-sudo }}
+          REPOSITORY: ${{ inputs.repository || github.repository }}
+        run: |
+          set +e
+          if [[ "${NO_SUDO}" == "false" ]]; then
+            sudo rm -rf "${GITHUB_WORKSPACE:?}/${REPOSITORY:?}"
+          else
+            rm -rf "${GITHUB_WORKSPACE:?}/${REPOSITORY:?}"
+          fi
+          set -e
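
For reference, a minimal sketch of how a downstream repository might call the new
reusable workflow, assuming it is referenced from the default branch of
pytorch/test-infra. The trigger, runner label, ROCm version, and script below are
illustrative placeholders rather than values defined by this patch:

name: ROCm smoke test
on:
  pull_request:
jobs:
  rocm-test:
    # The caller may need `permissions: id-token: write` here if the
    # configure-aws-credentials step authenticates via OIDC; on AWS-hosted
    # runners the instance role may be used instead.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2_rocm.yml@main
    with:
      runner: linux.rocm.gpu.2      # placeholder ROCm runner label
      gpu-arch-type: rocm           # routes the job through the Setup ROCM step
      gpu-arch-version: "6.3"       # placeholder ROCm version
      timeout: 60
      script: |
        python -m pip install -e .
        python -m pytest test/

With docker-image left at its default, the DOCKER_IMAGE expression in the workflow
would resolve the image to pytorch/almalinux-builder:rocm6.3 for the placeholder
values above.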