pytorch
diff --git a/‎.github/scripts/filter-matrix.py‎
Lines changed: 1 addition & 1 deletion b/‎.github/scripts/filter-matrix.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/scripts/install-cuda-aarch64.sh‎
Lines changed: 14 additions & 1 deletion b/‎.github/scripts/install-cuda-aarch64.sh‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎.github/workflows/build-test-linux-aarch64-jetpack.yml‎
Lines changed: 11 additions & 12 deletions b/‎.github/workflows/build-test-linux-aarch64-jetpack.yml‎
Lines changed: 11 additions & 12 deletions
diff --git a/‎.github/workflows/build-test-linux-aarch64.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/build-test-linux-aarch64.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/build_wheels_linux_aarch64.yml‎
Lines changed: 6 additions & 5 deletions b/‎.github/workflows/build_wheels_linux_aarch64.yml‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎BUILD.bazel‎
Lines changed: 2 additions & 4 deletions b/‎BUILD.bazel‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎MODULE.bazel‎
Lines changed: 7 additions & 6 deletions b/‎MODULE.bazel‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎README.md‎
Lines changed: 6 additions & 4 deletions b/‎README.md‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎bzl_def/WORKSPACE‎
Lines changed: 0 additions & 1 deletion b/‎bzl_def/WORKSPACE‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎core/conversion/conversion.cpp‎
Lines changed: 0 additions & 6 deletions b/‎core/conversion/conversion.cpp‎
Lines changed: 0 additions & 6 deletions
@@ -7,7 +7,7 @@
 from typing import Any, Dict, List
 
 # currently we don't support python 3.13t due to tensorrt does not support 3.13t
-disabled_python_versions: List[str] = ["3.13t"]
+disabled_python_versions: List[str] = ["3.13t", "3.14", "3.14t"]
 
 # jetpack 6.2 only officially supports python 3.10 and cu126
 jetpack_python_versions: List[str] = ["3.10"]
 
@@ -5,14 +5,27 @@ install_cuda_aarch64() {
     CU_VER=${CU_VERSION:2:2}-${CU_VERSION:4:1}
     # CU_VERSION: cu129 --> CU_DOT_VER: 12.9
     CU_DOT_VER=${CU_VERSION:2:2}.${CU_VERSION:4:1}
+    # CUDA_MAJOR_VERSION: cu128 --> 12
+    CUDA_MAJOR_VERSION=${CU_VERSION:2:2}
     dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
     # nccl version must match libtorch_cuda.so was built with https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
     dnf -y install cuda-compiler-${CU_VER}.aarch64 \
                    cuda-libraries-${CU_VER}.aarch64 \
                    cuda-libraries-devel-${CU_VER}.aarch64 \
                    libnccl-2.27.3-1+cuda${CU_DOT_VER} libnccl-devel-2.27.3-1+cuda${CU_DOT_VER} libnccl-static-2.27.3-1+cuda${CU_DOT_VER}
     dnf clean all
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib64:$LD_LIBRARY_PATH
+
+    nvshmem_version=3.3.9  # pinned NVSHMEM release to download
+    nvshmem_path="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${CUDA_MAJOR_VERSION}/txz/agnostic/aarch64"
+    nvshmem_filename="libnvshmem_cuda${CUDA_MAJOR_VERSION}-linux-sbsa-${nvshmem_version}.tar.gz"  # must agree with the cuda${CUDA_MAJOR_VERSION} directory in nvshmem_path
+    curl -fL "${nvshmem_path}/${nvshmem_filename}" -o nvshmem.tar.gz  # -f: fail on HTTP errors instead of saving the error page as the tarball
+    tar -xzf nvshmem.tar.gz  # unpacks into ./libnvshmem, which the cp lines below read from
+    cp -a libnvshmem/lib/* /usr/local/cuda/lib64/
+    cp -a libnvshmem/include/* /usr/local/cuda/include/
+    rm -rf nvshmem.tar.gz libnvshmem  # clean up both the tarball and the extracted libnvshmem tree
+    echo "nvshmem ${nvshmem_version} for cuda ${CUDA_MAJOR_VERSION} installed successfully"
+
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/include:/usr/lib64:$LD_LIBRARY_PATH
     ls -lart /usr/local/
     nvcc --version
     echo "cuda ${CU_VER} installed successfully"
 
@@ -1,17 +1,16 @@
 name: Build and test Linux aarch64 wheels for Jetpack
 
 on:
-  # TODO: Uncomment this when we have a stable release
-  # pull_request:
-  # push:
-  #   branches:
-  #     - main
-  #     - nightly
-  #     - release/*
-  #   tags:
-  #     # NOTE: Binary build pipelines should only get triggered on release candidate builds
-  #     # Release candidate tags look like: v1.11.0-rc1
-  #     - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+  pull_request:
+  push:
+    branches:
+      - main
+      - nightly
+      - release/*
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
 
 jobs:
@@ -66,7 +65,7 @@ jobs:
             post-script: packaging/post_build_script.sh
             smoke-test-script: packaging/smoke_test_script.sh
             package-name: torch_tensorrt
-    name: Build torch-tensorrt whl package
+    name: Build torch-tensorrt whl package for jetpack
     uses: ./.github/workflows/build_wheels_linux_aarch64.yml
     with:
       repository: ${{ matrix.repository }}
 
@@ -62,7 +62,7 @@ jobs:
             post-script: packaging/post_build_script.sh
             smoke-test-script: packaging/smoke_test_script.sh
             package-name: torch_tensorrt
-    name: Build torch-tensorrt whl package
+    name: Build torch-tensorrt whl package for SBSA
     uses: ./.github/workflows/build_wheels_linux_aarch64.yml
     with:
       repository: ${{ matrix.repository }}
 
@@ -133,7 +133,7 @@ jobs:
       UPLOAD_TO_BASE_BUCKET: ${{ matrix.upload_to_base_bucket }}
       ARCH: ${{ inputs.architecture }}
       BUILD_TARGET: ${{ inputs.build-target }}
-    name: build-wheel-${{ matrix.python_version }}-${{ matrix.desired_cuda }}-${{ matrix.gpu_arch_type }}
+    name: build-wheel-${{ matrix.python_version }}-${{ matrix.desired_cuda }}-${{ matrix.gpu_arch_type }}-${{ inputs.is-jetpack }}
     runs-on: ${{ matrix.validation_runner }}
     environment: ${{(inputs.trigger-event == 'schedule' || (inputs.trigger-event == 'push' && (startsWith(github.event.ref, 'refs/heads/nightly') || startsWith(github.event.ref, 'refs/tags/v')))) && 'pytorchbot-env' || ''}}
     container:
@@ -264,7 +264,7 @@ jobs:
           if [[ ${{ inputs.is-jetpack }} == false ]]; then
             ${CONDA_RUN} python setup.py bdist_wheel
           else
-            ${CONDA_RUN} python setup.py bdist_wheel --jetpack --plat-name=linux_tegra_aarch64
+            ${CONDA_RUN} python setup.py bdist_wheel --jetpack
           fi
       - name: Repair Manylinux_2_28 Wheel
         shell: bash -l {0}
@@ -335,9 +335,10 @@ jobs:
 
   upload:
     needs: build
+    name: upload-wheel-${{ matrix.python_version }}-${{ matrix.desired_cuda }}-${{ matrix.gpu_arch_type }}-${{ inputs.is-jetpack }}
     uses: pytorch/test-infra/.github/workflows/_binary_upload.yml@main
-    # for jetpack builds, only upload to pytorch index for nightly builds
-    if: ${{ inputs.is-jetpack == false || (github.event_name == 'push' && startsWith(github.event.ref, 'refs/heads/nightly')) }}
+    # Jetpack builds are not uploaded to the PyTorch nightly index; they are uploaded manually to https://pypi.jetson-ai-lab.io/ for each release.
+    if: ${{ inputs.is-jetpack == false }}
     with:
       repository: ${{ inputs.repository }}
       ref: ${{ inputs.ref }}
@@ -351,5 +352,5 @@ jobs:
       PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.repository }}-${{ inputs.is-jetpack }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
@@ -88,8 +88,7 @@ pkg_tar(
     name = "libtorchtrt",
     srcs = [
         "//:LICENSE",
-        "//bzl_def:BUILD",
-        "//bzl_def:WORKSPACE",
+        "//third_party/torch_tensorrt:BUILD",
     ],
     extension = "tar.gz",
     package_dir = "torch_tensorrt",
@@ -107,8 +106,7 @@ pkg_tar(
     name = "libtorchtrt_runtime",
     srcs = [
         "//:LICENSE",
-        "//bzl_def:BUILD",
-        "//bzl_def:WORKSPACE",
+        "//third_party/torch_tensorrt:BUILD",
     ],
     extension = "tar.gz",
     package_dir = "torch_tensorrt_runtime",
 
@@ -24,14 +24,16 @@ git_override(
 
 local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local.bzl", "local_repository")
 
+
+new_local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
+
 # External dependency for torch_tensorrt if you already have precompiled binaries.
-local_repository(
+new_local_repository(
     name = "torch_tensorrt",
-    path = "/opt/conda/lib/python3.11/site-packages/torch_tensorrt",
+    build_file = "@//third_party/torch_tensorrt:BUILD",
+    path = "/usr/local/lib/python3.12/site-packages/torch_tensorrt/",
 )
 
-new_local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
-
 # CUDA should be installed on the system locally
 # for linux x86_64 and aarch64
 new_local_repository(
@@ -90,10 +92,9 @@ http_archive(
 http_archive(
     name = "torch_l4t",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "6eff643c0a7acda92734cc798338f733ff35c7df1a4434576f5ff7c66fc97319",
     strip_prefix = "torch",
     type = "zip",
-    urls = ["https://pypi.jetson-ai-lab.dev/jp6/cu126/+f/6ef/f643c0a7acda9/torch-2.7.0-cp310-cp310-linux_aarch64.whl"],
+    urls = ["https://pypi.jetson-ai-lab.io/jp6/cu126/+f/62a/1beee9f2f1470/torch-2.8.0-cp310-cp310-linux_aarch64.whl"],
 )
 
 # Download these tarballs manually from the NVIDIA website
 
@@ -7,7 +7,7 @@ Torch-TensorRT
 [![Documentation](https://img.shields.io/badge/docs-master-brightgreen)](https://nvidia.github.io/Torch-TensorRT/)
 [![pytorch](https://img.shields.io/badge/PyTorch-2.8-green)](https://download.pytorch.org/whl/nightly/cu128)
 [![cuda](https://img.shields.io/badge/CUDA-12.8-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TensorRT-10.11.0-green)](https://github.com/nvidia/tensorrt-llm)
+[![trt](https://img.shields.io/badge/TensorRT-10.12.0-green)](https://github.com/nvidia/tensorrt-llm)
 [![license](https://img.shields.io/badge/license-BSD--3--Clause-blue)](./LICENSE)
 [![Linux x86-64 Nightly Wheels](https://github.com/pytorch/TensorRT/actions/workflows/build-test-linux-x86_64.yml/badge.svg?branch=nightly)](https://github.com/pytorch/TensorRT/actions/workflows/build-test-linux-x86_64.yml)
 [![Linux SBSA Nightly Wheels](https://github.com/pytorch/TensorRT/actions/workflows/build-test-linux-aarch64.yml/badge.svg?branch=nightly)](https://github.com/pytorch/TensorRT/actions/workflows/build-test-linux-aarch64.yml)
@@ -93,9 +93,11 @@ auto results = trt_mod.forward({input_tensor});
 ```
 
 ## Further resources
+- [Double PyTorch Inference Speed for Diffusion Models Using Torch-TensorRT](https://developer.nvidia.com/blog/double-pytorch-inference-speed-for-diffusion-models-using-torch-tensorrt/)
 - [Up to 50% faster Stable Diffusion inference with one line of code](https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion.html#sphx-glr-tutorials-rendered-examples-dynamo-torch-compile-stable-diffusion-py)
-- [Optimize LLMs from Hugging Face with Torch-TensorRT]() \[coming soon\]
+- [Optimize LLMs from Hugging Face with Torch-TensorRT](https://docs.pytorch.org/TensorRT/tutorials/compile_hf_models.html#compile-hf-models)
 - [Run your model in FP8 with Torch-TensorRT](https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq.html)
+- [Accelerated Inference in PyTorch 2.X with Torch-TensorRT](https://www.youtube.com/watch?v=eGDMJ3MY4zk&t=1s)
 - [Tools to resolve graph breaks and boost performance]() \[coming soon\]
 - [Tech Talk (GTC '23)](https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s51714/)
 - [Documentation](https://nvidia.github.io/Torch-TensorRT/)
@@ -119,9 +121,9 @@ auto results = trt_mod.forward({input_tensor});
 These are the following dependencies used to verify the testcases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.
 
 - Bazel 8.1.1
-- Libtorch 2.8.0.dev (latest nightly)
+- Libtorch 2.9.0.dev (latest nightly)
 - CUDA 12.8 (CUDA 12.6 on Jetson)
-- TensorRT 10.11 (TensorRT 10.3 on Jetson)
+- TensorRT 10.12 (TensorRT 10.3 on Jetson)
 
 ## Deprecation Policy
 
 
@@ -202,13 +202,7 @@ void AddInputs(ConversionCtx* ctx, c10::ArrayRef<const torch::jit::Value*> input
   TORCHTRT_CHECK(
       profile->isValid(),
       "Optimization profile is invalid, please check the input range provided (conversion.AddInputs)");
-
   ctx->cfg->addOptimizationProfile(profile);
-#if NV_TENSORRT_MAJOR > 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR >= 1)
-  if (ctx->enabled_precisions.find(nvinfer1::DataType::kINT8) != ctx->enabled_precisions.end()) {
-    ctx->cfg->setCalibrationProfile(profile);
-  }
-#endif
 }
 
 void MarkOutputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> outputs) {