pytorch · lanluo-nvidia · Nov 14, 2024 · Oct 28, 2024 · Oct 28, 2024 · Oct 28, 2024
diff --git a/.github/scripts/generate-tensorrt-test-matrix.py b/.github/scripts/generate-tensorrt-test-matrix.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+import argparse
+import copy
+import json
+import sys
+
+# CUDA tags (matched against each matrix entry's "desired_cuda") that are kept
+# for each release channel; main() drops entries whose tag is not listed.
+CUDA_VERSIONS_DICT = {
+    "nightly": ["cu124"],
+    "test": ["cu121", "cu124"],
+    "release": ["cu121", "cu124"],
+}
+
+# Python versions (matched against each matrix entry's "python_version") that
+# are kept for each release channel; main() drops entries not listed here.
+PYTHON_VERSIONS_DICT = {
+    "nightly": ["3.9"],
+    "test": ["3.9", "3.10", "3.11", "3.12"],
+    "release": ["3.9", "3.10", "3.11", "3.12"],
+}
+
+# TensorRT archives, keyed by platform ("windows" / "linux") and then by
+# TensorRT version. Each entry holds:
+#   urls         - download URL of the archive (a single string, despite the
+#                  plural key name)
+#   strip_prefix - name of the archive's versioned top-level directory
+#                  (presumably stripped on extraction; confirm with the consumer)
+#   sha256       - presumably the SHA-256 checksum of the archive; TODO confirm
+# main() emits each entry with an additional "version" key set to the version
+# string used as the dictionary key.
+# NOTE(review): every URL points at a cuda-12.6 archive regardless of the
+# "desired_cuda" tag of the matrix entry - confirm this is intended.
+TENSORRT_VERSIONS_DICT = {
+    "windows": {
+        "10.4.0": {
+            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip",
+            "strip_prefix": "TensorRT-10.4.0.26",
+            "sha256": "3a7de83778b9e9f812fd8901e07e0d7d6fc54ce633fcff2e340f994df2c6356c",
+        },
+        "10.5.0": {
+            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/zip/TensorRT-10.5.0.18.Windows.win10.cuda-12.6.zip",
+            "strip_prefix": "TensorRT-10.5.0.18",
+            "sha256": "e6436f4164db4e44d727354dccf7d93755efb70d6fbfd6fa95bdfeb2e7331b24",
+        },
+        "10.6.0": {
+            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/zip/TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip",
+            "strip_prefix": "TensorRT-10.6.0.26",
+            "sha256": "6c6d92c108a1b3368423e8f69f08d31269830f1e4c9da43b37ba34a176797254",
+        },
+    },
+    "linux": {
+        "10.4.0": {
+            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-10.4.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz",
+            "strip_prefix": "TensorRT-10.4.0.26",
+            "sha256": "cb0273ecb3ba4db8993a408eedd354712301a6c7f20704c52cdf9f78aa97bbdb",
+        },
+        "10.5.0": {
+            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz",
+            "strip_prefix": "TensorRT-10.5.0.18",
+            "sha256": "f404d379d639552a3e026cd5267213bd6df18a4eb899d6e47815bbdb34854958",
+        },
+        "10.6.0": {
+            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz",
+            "strip_prefix": "TensorRT-10.6.0.26",
+            "sha256": "33d3c2f3f4c84dc7991a4337a6fde9ed33f5c8e5c4f03ac2eb6b994a382b03a0",
+        },
+    },
+}
+
+
+def main(args: list[str]) -> None:
+    """Expand a build matrix with one entry per available TensorRT version.
+
+    Parses the JSON matrix given via ``--matrix``, keeps only the ``include``
+    entries whose ``desired_cuda`` and ``python_version`` are listed for the
+    matrix's channel, and emits one copy of every kept entry per TensorRT
+    version known for the target platform. Each copy gets a ``tensorrt`` object
+    (``urls``, ``strip_prefix``, ``sha256``, ``version``). The resulting
+    ``{"include": [...]}`` document is printed to stdout as JSON.
+
+    The channel and the platform are read from the first ``include`` entry
+    only; the platform is "windows" or "linux" depending on which of those
+    substrings appears in its ``validation_runner``.
+
+    Args:
+        args: Command-line arguments, without the program name.
+
+    Raises:
+        Exception: If ``--matrix`` is empty, the ``include`` list is empty, the
+            first entry lacks ``channel`` or ``validation_runner``, or the
+            channel / runner platform is not supported.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--matrix",
+        help="matrix",
+        type=str,
+        default="",
+    )
+
+    options = parser.parse_args(args)
+    if options.matrix == "":
+        raise Exception("--matrix is empty, please provide the matrix json str")
+
+    matrix_dict = json.loads(options.matrix)
+    includes = matrix_dict["include"]
+    # Validate explicitly: an `assert` would be stripped when running with -O.
+    if not includes:
+        raise Exception(f"include field is empty in the matrix: {options.matrix}")
+
+    first = includes[0]
+    if "channel" not in first:
+        raise Exception(f"channel field is missing from the matrix: {options.matrix}")
+    channel = first["channel"]
+    if channel not in ("nightly", "test", "release"):
+        raise Exception(
+            f"channel field: {channel} is not supported, currently supported value: nightly, test, release"
+        )
+
+    if "validation_runner" not in first:
+        raise Exception(
+            f"validation_runner field is missing from the matrix: {options.matrix}"
+        )
+    runner = first["validation_runner"]
+    if "windows" in runner:
+        arch = "windows"
+    elif "linux" in runner:
+        arch = "linux"
+    else:
+        # The entry is a dict, so the runner must be read by key; attribute
+        # access would raise AttributeError and hide this message.
+        raise Exception(
+            f"{runner} is not the supported arch, currently only support windows and linux"
+        )
+
+    cuda_versions = CUDA_VERSIONS_DICT[channel]
+    python_versions = PYTHON_VERSIONS_DICT[channel]
+    tensorrt_versions = TENSORRT_VERSIONS_DICT[arch]
+
+    filtered_includes = []
+    for item in includes:
+        if (
+            item["desired_cuda"] not in cuda_versions
+            or item["python_version"] not in python_versions
+        ):
+            continue
+        for tensorrt_version, tensorrt_json in tensorrt_versions.items():
+            new_item = copy.deepcopy(item)
+            # Build a fresh dict per entry so the module-level table is never
+            # mutated and emitted entries do not share state. "version" stays
+            # the last key, so the printed JSON is unchanged.
+            new_item["tensorrt"] = {**tensorrt_json, "version": tensorrt_version}
+            filtered_includes.append(new_item)
+    print(json.dumps({"include": filtered_includes}))
+
+
+# Script entry point: pass the command-line arguments without the program name.
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/.github/workflows/build-tensorrt-linux.yml b/.github/workflows/build-tensorrt-linux.yml
@@ -0,0 +1,222 @@
+name: Build Torch-TensorRT wheel on Linux with specified tensorRT version
+
+on:
+  workflow_call:
+    inputs:
+      repository:
+        description: 'Repository to checkout, defaults to ""'
+        default: ""
+        type: string
+      ref:
+        description: 'Reference to checkout, defaults to "nightly"'
+        default: "nightly"
+        type: string
+      test-infra-repository:
+        description: "Test infra repository to use"
+        default: "pytorch/test-infra"
+        type: string
+      test-infra-ref:
+        description: "Test infra reference to use"
+        default: ""
+        type: string
+      build-matrix:
+        description: "Build matrix to utilize"
+        default: ""
+        type: string
+      pre-script:
+        description: "Pre script to run prior to build"
+        default: ""
+        type: string
+      post-script:
+        description: "Post script to run after the build"
+        default: ""
+        type: string
+      smoke-test-script:
+        description: "Script for Smoke Test for a specific domain"
+        default: ""
+        type: string
+      env-var-script:
+        description: "Script that sets Domain-Specific Environment Variables"
+        default: ""
+        type: string
+      package-name:
+        description: "Name of the actual python package that is imported"
+        default: ""
+        type: string
+      trigger-event:
+        description: "Trigger Event in caller that determines whether or not to upload"
+        default: ""
+        type: string
+      cache-path:
+        description: "The path(s) on the runner to cache or restore. The path is relative to repository."
+        default: ""
+        type: string
+      cache-key:
+        description: "The key created when saving a cache and the key used to search for a cache."
+        default: ""
+        type: string
+      architecture:
+        description: Architecture to build for x86_64 for default Linux, or aarch64 for Linux aarch64 builds
+        required: false
+        type: string
+        default: x86_64
+      submodules:
+        description: Works as stated in actions/checkout, but the default value is recursive
+        required: false
+        type: string
+        default: recursive
+      setup-miniconda:
+        description: Set to true if setup-miniconda is needed
+        required: false
+        type: boolean
+        default: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  build:
+    strategy:
+      # Keep the other matrix entries running when one of them fails.
+      fail-fast: false
+      # The matrix is supplied by the caller as a JSON string.
+      matrix: ${{ fromJSON(inputs.build-matrix) }}
+    env:
+      PYTHON_VERSION: ${{ matrix.python_version }}
+      PACKAGE_TYPE: wheel
+      REPOSITORY: ${{ inputs.repository }}
+      REF: ${{ inputs.ref }}
+      CU_VERSION: ${{ matrix.desired_cuda }}
+      UPLOAD_TO_BASE_BUCKET: ${{ matrix.upload_to_base_bucket }}
+      ARCH: ${{ inputs.architecture }}
+      # TensorRT archive details, read from the `tensorrt` object of the
+      # matrix entry.
+      TENSORRT_STRIP_PREFIX: ${{ matrix.tensorrt.strip_prefix }}
+      TENSORRT_VERSION: ${{ matrix.tensorrt.version }}
+      TENSORRT_URLS: ${{ matrix.tensorrt.urls }}
+      TENSORRT_SHA256: ${{ matrix.tensorrt.sha256 }}
+      # Artifact name combining TensorRT version, Python version, CUDA tag
+      # and architecture; used by the upload step.
+      UPLOAD_ARTIFACT_NAME: pytorch_tensorrt_${{ matrix.tensorrt.version }}_${{ matrix.python_version }}_${{ matrix.desired_cuda }}_${{ inputs.architecture }}
+    name: build_tensorrt${{ matrix.tensorrt.version }}_py${{matrix.python_version}}_${{matrix.desired_cuda}}
+    # Runner and container image both come from the matrix entry.
+    runs-on: ${{ matrix.validation_runner }}
+    container:
+      image: ${{ matrix.container_image }}
+      # Pass '--gpus all' to the container only for CUDA entries.
+      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
+    # If a build is taking longer than 120 minutes on these runners we need
+    # to have a conversation
+    timeout-minutes: 120
+
+    steps:
+      # Delete and recreate the workspace directory before anything is
+      # checked out.
+      # NOTE(review): in the aarch64 branch the `*` sits inside the double
+      # quotes, so bash does not expand it and the command targets a path
+      # literally named "*". The contents of RUNNER_TEMP are presumably meant
+      # to be cleared; confirm what is safe to delete there before moving the
+      # glob outside the quotes.
+      - name: Clean workspace
+        shell: bash -l {0}
+        run: |
+          set -x
+          echo "::group::Cleanup debug output"
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          if [[ "${{ inputs.architecture }}" = "aarch64" ]]; then
+            rm -rf "${RUNNER_TEMP}/*"
+          fi
+          echo "::endgroup::"
+      # Check out the test-infra repository/ref chosen by the caller into
+      # ./test-infra; later steps reference actions under that path.
+      - uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ inputs.test-infra-repository }}
+          ref: ${{ inputs.test-infra-ref }}
+          path: test-infra
+      # aarch64 only: check out pytorch/builder into ./builder.
+      - uses: actions/checkout@v3
+        if: ${{ env.ARCH == 'aarch64' }}
+        with:
+          # Repository and ref are fixed here; they are not caller-configurable.
+          repository: "pytorch/builder"
+          ref: "main"
+          path: builder
+      # aarch64 only: run the setup script from the ./builder checkout and add
+      # /opt/conda/bin to the path file for later steps. The script runs under
+      # `set +e`, so a non-zero exit from it does not fail this step.
+      - name: Set linux aarch64 CI
+        if: ${{ inputs.architecture == 'aarch64' }}
+        shell: bash -l {0}
+        env:
+          DESIRED_PYTHON: ${{ matrix.python_version }}
+        run: |
+          set +e
+          # TODO: This is temporary aarch64 setup script, this should be integrated into aarch64 docker.
+          ${GITHUB_WORKSPACE}/builder/aarch64_linux/aarch64_ci_setup.sh
+          echo "/opt/conda/bin" >> $GITHUB_PATH
+          set -e
+      # Local action from the test-infra checkout.
+      # NOTE(review): presumably this is what sets env.CHANNEL, which the next
+      # step's condition reads - confirm against the action.
+      - uses: ./test-infra/.github/actions/set-channel
+      # "test" channel only: export PYTORCH_VERSION from the matrix entry's
+      # stable_version for the following steps.
+      - name: Set PYTORCH_VERSION
+        if: ${{ env.CHANNEL == 'test' }}
+        run: |
+          # When building RC, set the version to be the current candidate version,
+          # otherwise, leave it alone so nightly will pick up the latest
+          echo "PYTORCH_VERSION=${{ matrix.stable_version }}" >> "${GITHUB_ENV}"
+      # Local action from the test-infra checkout; receives the repository/ref
+      # to build plus the Python version, CUDA tag and architecture.
+      - uses: ./test-infra/.github/actions/setup-binary-builds
+        env:
+          # 'linux-aarch64' for aarch64 builds, empty otherwise.
+          PLATFORM: ${{ inputs.architecture == 'aarch64'  && 'linux-aarch64' || ''}}
+        with:
+          repository: ${{ inputs.repository }}
+          ref: ${{ inputs.ref }}
+          submodules: ${{ inputs.submodules }}
+          setup-miniconda: ${{ inputs.setup-miniconda }}
+          python-version: ${{ env.PYTHON_VERSION }}
+          cuda-version: ${{ env.CU_VERSION }}
+          arch: ${{ env.ARCH }}
+      # Append the caller's env-var script to BUILD_ENV_FILE; skipped when no
+      # script is given. Runs from the checked-out repository directory.
+      - name: Combine Env Var and Build Env Files
+        if: ${{ inputs.env-var-script != '' }}
+        working-directory: ${{ inputs.repository }}
+        shell: bash -l {0}
+        run: |
+          cat "${{ inputs.env-var-script }}" >> "${BUILD_ENV_FILE}"
+      # Source BUILD_ENV_FILE, then run ${PIP_INSTALL_TORCH} through
+      # ${CONDA_RUN}.
+      # NOTE(review): BUILD_ENV_FILE, CONDA_RUN and PIP_INSTALL_TORCH are not
+      # defined in this workflow; presumably they are provided by the
+      # setup-binary-builds action - confirm.
+      - name: Install torch dependency
+        shell: bash -l {0}
+        run: |
+          set -x
+          # shellcheck disable=SC1090
+          source "${BUILD_ENV_FILE}"
+          # shellcheck disable=SC2086
+          ${CONDA_RUN} ${PIP_INSTALL_TORCH}
+      # Run the caller's pre-script through the test-infra cache-aware action;
+      # skipped when no pre-script is given.
+      - name: Run Pre-Script with Caching
+        if: ${{ inputs.pre-script != '' }}
+        uses: ./test-infra/.github/actions/run-script-with-cache
+        with:
+          cache-path: ${{ inputs.cache-path }}
+          cache-key: ${{ inputs.cache-key }}
+          repository: ${{ inputs.repository  }}
+          script: ${{ inputs.pre-script }}
+      # Run `setup.py clean` in the repository directory with the build
+      # environment sourced.
+      - name: Build clean
+        working-directory: ${{ inputs.repository }}
+        shell: bash -l {0}
+        run: |
+          set -x
+          source "${BUILD_ENV_FILE}"
+          ${CONDA_RUN} python setup.py clean
+      # Build the wheel with `setup.py bdist_wheel`; the upload step later
+      # publishes the repository's dist directory.
+      - name: Build the wheel (bdist_wheel)
+        working-directory: ${{ inputs.repository }}
+        shell: bash -l {0}
+        run: |
+          set -x
+          source "${BUILD_ENV_FILE}"
+          ${CONDA_RUN} python setup.py bdist_wheel
+
+      # Run the caller's post-script after the wheel build; skipped when no
+      # post-script is given. No cache path/key is passed here.
+      - name: Run Post-Script
+        if: ${{ inputs.post-script != '' }}
+        uses: ./test-infra/.github/actions/run-script-with-cache
+        with:
+          repository: ${{ inputs.repository  }}
+          script: ${{ inputs.post-script }}
+      # Placeholder: the script only sources BUILD_ENV_FILE. PACKAGE_NAME and
+      # SMOKE_TEST_SCRIPT are exported to the step but not used by it yet.
+      - name: Smoke Test
+        shell: bash -l {0}
+        env:
+          PACKAGE_NAME: ${{ inputs.package-name }}
+          SMOKE_TEST_SCRIPT: ${{ inputs.smoke-test-script }}
+        run: |
+          set -x
+          source "${BUILD_ENV_FILE}"
+          # TODO: add smoke test for the auditwheel tarball built
+
+      # NB: Only upload to GitHub after passing smoke tests
+      # NOTE(review): the Smoke Test step above does not run any test yet, so
+      # this ordering currently gates the upload on the build steps only.
+      - name: Upload wheel to GitHub
+        # A failed upload does not fail the job.
+        continue-on-error: true
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ env.UPLOAD_ARTIFACT_NAME }}
+          path: ${{ inputs.repository }}/dist
+
+# Cancel an in-progress run when a newer one starts in the same group. The
+# group is keyed on the workflow, the pull request number (or ref name), the
+# repository input and whether the run was manually dispatched. Only inputs
+# declared under `on.workflow_call.inputs` are referenced.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true