From f203595427ccd00435d62adde3ca4fa0067682ab Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 13 Feb 2025 12:10:45 -0800 Subject: [PATCH 01/15] TensorRT-LLM import fix and aot_joint_export specify as explicit setting in dynamo.compile TRT-LLM installation utilities and adding test cases adding the option in _compiler.py changes in the TRT-LLM loading tool- removing install_wget, install_unzip, install_mpi Further changes in error logging of the TRT-LLM installation tool moving the load_tensorrt_llm to dynamo/utils.py correcting misprint for TRT LLM load Using python lib for download to make it platform agnostic dll file path update for windows correcting the non critical lint error Including version in versions.txt --- dev_dep_versions.yml | 1 + py/torch_tensorrt/dynamo/_compiler.py | 12 ++ .../dynamo/conversion/converter_utils.py | 67 +-------- .../conversion/custom_ops_converters.py | 6 +- py/torch_tensorrt/dynamo/utils.py | 130 +++++++++++++++++- setup.py | 4 + 6 files changed, 153 insertions(+), 67 deletions(-) diff --git a/dev_dep_versions.yml b/dev_dep_versions.yml index c9a738feb6..c57a2d8d9e 100644 --- a/dev_dep_versions.yml +++ b/dev_dep_versions.yml @@ -1,2 +1,3 @@ __cuda_version__: "12.8" __tensorrt_version__: "10.12.0" +__tensorrt_llm_version__: "0.17.0.post1" diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 608c8e84c9..165f79b2c8 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -103,6 +103,7 @@ def cross_compile_for_windows( tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -176,6 +177,7 @@ def cross_compile_for_windows( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -339,6 +341,7 @@ def cross_compile_for_windows( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_distributed_mode_trace": use_distributed_mode_trace, } # disable the following settings is not supported for cross compilation for windows feature @@ -439,6 +442,7 @@ def compile( tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -514,7 +518,11 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. 
tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). +<<<<<<< HEAD offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage. +======= + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model +>>>>>>> c3b62d239 (TensorRT-LLM import fix and aot_joint_export specify as explicit setting in dynamo.compile) **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -688,6 +696,7 @@ def compile( "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, "offload_module_to_cpu": offload_module_to_cpu, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) @@ -1051,6 +1060,7 @@ def convert_exported_program_to_serialized_trt_engine( tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1114,6 +1124,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). 
+        use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
     """
@@ -1236,6 +1247,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
     settings = CompilationSettings(**compilation_options)
diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py
index 7d7f4274ff..8771dad795 100644
--- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py
+++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py
@@ -1,8 +1,6 @@
 import collections
-import ctypes
 import functools
 import logging
-import os
 from typing import (
     Any,
     Callable,
@@ -24,6 +22,7 @@
 from torch.fx.node import Argument, Target
 from torch.fx.passes.shape_prop import TensorMetadata
 from torch_tensorrt import _enums
+from torch_tensorrt._enums import Platform
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo._SourceIR import SourceIR
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
@@ -1120,69 +1119,6 @@ def args_bounds_check(
     return args[i] if len(args) > i and args[i] is not None else replacement


-def load_tensorrt_llm() -> bool:
-    """
-    Attempts to load the TensorRT-LLM plugin and initialize it.
-
-    Returns:
-        bool: True if the plugin was successfully loaded and initialized, False otherwise.
-    """
-    try:
-        import tensorrt_llm as trt_llm  # noqa: F401
-
-        _LOGGER.info("TensorRT-LLM successfully imported")
-        return True
-    except (ImportError, AssertionError) as e_import_error:
-        # Check for environment variable for the plugin library path
-        plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
-        if not plugin_lib_path:
-            _LOGGER.warning(
-                "TensorRT-LLM is not installed. 
Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops", - ) - return False - - _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}") - try: - # Load the shared library - handle = ctypes.CDLL(plugin_lib_path) - _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - _LOGGER.error( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, - ) - return False - - try: - # Configure plugin initialization arguments - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - _LOGGER.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False - - try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): - _LOGGER.info("TensorRT-LLM plugin successfully initialized") - return True - else: - _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - _LOGGER.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) - return False - return False - - def promote_trt_tensors_to_same_dtype( ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str ) -> tuple[TRTTensor, TRTTensor]: @@ -1220,3 +1156,4 @@ def promote_trt_tensors_to_same_dtype( rhs_cast = cast_trt_tensor(ctx, rhs, promoted_dtype, f"{name_prefix}rhs_cast") return lhs_cast, rhs_cast + diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index 1442c2b17b..d7c89fd2b7 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -11,7 +11,11 @@ from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( dynamo_tensorrt_converter, ) -from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm +from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + tensorrt_fused_nccl_all_gather_op, + tensorrt_fused_nccl_reduce_scatter_op, +) +from torch_tensorrt.dynamo.utils import load_tensorrt_llm _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index de736db1bf..1b47302803 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -1,7 +1,10 @@ from __future__ import annotations +import ctypes import gc import logging +import os +import urllib.request import warnings from dataclasses import fields, replace from enum import Enum @@ -14,9 +17,10 @@ from torch._subclasses.fake_tensor import FakeTensor from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._Device import Device -from torch_tensorrt._enums import dtype +from torch_tensorrt._enums import Platform, dtype from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input +from torch_tensorrt._version import __tensorrt_llm_version__ from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._defaults import 
default_device from torch_tensorrt.dynamo._engine_cache import BaseEngineCache @@ -851,3 +855,127 @@ def is_tegra_platform() -> bool: if torch.cuda.get_device_capability() in [(8, 7), (7, 2)]: return True return False + + +def download_plugin_lib_path(py_version: str, platform: str) -> str: + plugin_lib_path = None + + # Downloading TRT-LLM lib + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = f"tensorrt_llm-{__tensorrt_llm_version__}-{py_version}-{py_version}-{platform}.whl" + download_url = base_url + file_name + if not (os.path.exists(file_name)): + try: + logger.debug(f"Downloading {download_url} ...") + urllib.request.urlretrieve(download_url, file_name) + logger.debug("Download succeeded and TRT-LLM wheel is now present") + except urllib.error.HTTPError as e: + logger.error( + f"HTTP error {e.code} when trying to download {download_url}: {e.reason}" + ) + except urllib.error.URLError as e: + logger.error( + f"URL error when trying to download {download_url}: {e.reason}" + ) + except OSError as e: + logger.error(f"Local file write error: {e}") + + # Proceeding with the unzip of the wheel file + # This will exist if the filename was already downloaded + if "linux" in platform: + lib_filename = "libnvinfer_plugin_tensorrt_llm.so" + else: + lib_filename = "libnvinfer_plugin_tensorrt_llm.dll" + plugin_lib_path = os.path.join("./tensorrt_llm/libs", lib_filename) + if os.path.exists(plugin_lib_path): + return plugin_lib_path + try: + import zipfile + except ImportError as e: + raise ImportError( + "zipfile module is required but not found. Please install zipfile" + ) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = "./tensorrt_llm/libs/" + lib_filename + return plugin_lib_path + + +def load_tensorrt_llm() -> bool: + """ + Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + + Returns: + bool: True if the plugin was successfully loaded and initialized, False otherwise. + """ + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if not plugin_lib_path: + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: + logger.warning( + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" + ) + return False + else: + # this is used as the default py version + py_version = "cp310" + platform = Platform.current_platform() + + platform = str(platform).lower() + plugin_lib_path = download_plugin_lib_path(py_version, platform) + + try: + # Load the shared TRT-LLM file + handle = ctypes.CDLL(plugin_lib_path) + logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + if "libmpi" in str(e_os_error): + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"The dependency libmpi.so is missing. 
" + f"Please install the packages libmpich-dev and libopenmpi-dev.", + exc_info=e_os_error, + ) + else: + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) + return False + + try: + # Configure plugin initialization arguments + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = ctypes.c_bool + except AttributeError as e_plugin_unavailable: + logger.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False + + try: + # Initialize the plugin + TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" + if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): + logger.info("TensorRT-LLM plugin successfully initialized") + return True + else: + logger.warning("TensorRT-LLM plugin library failed in initialization") + return False + except Exception as e_initialization_error: + logger.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False + return False diff --git a/setup.py b/setup.py index 4dbfe84334..62b3181e6a 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ __version__: str = "0.0.0" __cuda_version__: str = "0.0" __tensorrt_version__: str = "0.0" +__tensorrt_llm_version__: str = "0.0" LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$") @@ -63,6 +64,7 @@ def get_base_version() -> str: def load_dep_info(): global __cuda_version__ global __tensorrt_version__ + global __tensorrt_llm_version__ with open("dev_dep_versions.yml", "r") as stream: versions = yaml.safe_load(stream) if (gpu_arch_version := os.environ.get("CU_VERSION")) is not None: @@ -72,6 +74,7 @@ def load_dep_info(): else: __cuda_version__ = versions["__cuda_version__"] __tensorrt_version__ = versions["__tensorrt_version__"] + __tensorrt_llm_version__ = versions["__tensorrt_llm_version__"] load_dep_info() @@ -224,6 +227,7 @@ def gen_version_file(): f.write('__version__ = "' + __version__ + '"\n') f.write('__cuda_version__ = "' + __cuda_version__ + '"\n') f.write('__tensorrt_version__ = "' + __tensorrt_version__ + '"\n') + f.write('__tensorrt_llm_version__ = "' + __tensorrt_llm_version__ + '"\n') def copy_libtorchtrt(multilinux=False, rt_only=False): From f7fe3011ab6e6142dddda3a57f05c17826fed672 Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 20 May 2025 12:42:44 -0700 Subject: [PATCH 02/15] linting error fixes and rebase fix --- py/torch_tensorrt/dynamo/_compiler.py | 3 --- py/torch_tensorrt/dynamo/conversion/converter_utils.py | 1 - 2 files changed, 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 165f79b2c8..5f62506a02 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -518,11 +518,8 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). -<<<<<<< HEAD offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage. 
-======= use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model ->>>>>>> c3b62d239 (TensorRT-LLM import fix and aot_joint_export specify as explicit setting in dynamo.compile) **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 8771dad795..c080b6af24 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -1156,4 +1156,3 @@ def promote_trt_tensors_to_same_dtype( rhs_cast = cast_trt_tensor(ctx, rhs, promoted_dtype, f"{name_prefix}rhs_cast") return lhs_cast, rhs_cast - From 1eccce8c380b6e635cdb65dbca85089ceacd7dbe Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 20 May 2025 12:56:57 -0700 Subject: [PATCH 03/15] removing Platform enum from converter_utils.py --- py/torch_tensorrt/dynamo/conversion/converter_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index c080b6af24..89f821b954 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -22,7 +22,6 @@ from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums -from torch_tensorrt._enums import Platform from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext From 945358404f7c0a1e86cc447630d2c916284bfb9e Mon Sep 17 00:00:00 2001 From: apbose Date: Fri, 13 Jun 2025 14:51:45 -0700 Subject: [PATCH 04/15] Addressing review comments- tmp dir for wheel download and wheel extraction, variable for py_version --- py/torch_tensorrt/dynamo/utils.py | 194 +++++++++++++------ tests/py/dynamo/distributed/test_nccl_ops.py | 1 + 2 files changed, 136 insertions(+), 59 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 1b47302803..c0ddcc8bc5 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -2,13 +2,27 @@ import ctypes import gc +import getpass import logging import os +import tempfile import urllib.request import warnings +from contextlib import contextmanager from dataclasses import fields, replace from enum import Enum -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np import sympy @@ -37,6 +51,7 @@ RTOL = 5e-3 ATOL = 5e-3 CPU_DEVICE = "cpu" +_WHL_CPYTHON_VERSION = "cp310" class Frameworks(Enum): @@ -271,6 +286,19 @@ def set_log_level(parent_logger: Any, level: Any) -> None: """ if parent_logger: parent_logger.setLevel(level) + print("Handlers for parent_logger:", parent_logger.handlers) + print("bool check--", parent_logger.hasHandlers()) + if parent_logger.hasHandlers(): + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) # Allow debug messages on handler + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + ch.setFormatter(formatter) + parent_logger.addHandler(ch) + 
print("Logger level:", parent_logger.level) + # print("Parent logger level:", logger.parent.level) + print("Root logger level:", logging.getLogger().level) if ENABLED_FEATURES.torch_tensorrt_runtime: if level == logging.DEBUG: @@ -857,17 +885,41 @@ def is_tegra_platform() -> bool: return False -def download_plugin_lib_path(py_version: str, platform: str) -> str: - plugin_lib_path = None +@contextmanager +def download_plugin_lib_path(platform: str) -> Iterator[str]: + """ + Downloads (if needed) and extracts the TensorRT-LLM plugin wheel for the specified platform, + then yields the path to the extracted shared library (.so or .dll). - # Downloading TRT-LLM lib - base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-{__tensorrt_llm_version__}-{py_version}-{py_version}-{platform}.whl" - download_url = base_url + file_name - if not (os.path.exists(file_name)): + The wheel file is cached in a user-specific temporary directory to avoid repeated downloads. + Extraction happens in a temporary directory that is cleaned up after use. + + Args: + platform (str): The platform identifier string (e.g., 'linux_x86_64') to select the correct wheel. + + Yields: + str: The full path to the extracted TensorRT-LLM shared library file. + + Raises: + ImportError: If the 'zipfile' module is not available. + RuntimeError: If the wheel file is missing, corrupted, or extraction fails. + """ + plugin_lib_path = None + username = getpass.getuser() + torchtrt_cache_dir = Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" + torchtrt_cache_dir.mkdir(parents=True, exist_ok=True) + file_name = f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-{_WHL_CPYTHON_VERSION}-{platform}.whl" + torchtrt_cache_trtllm_whl = torchtrt_cache_dir / file_name + downloaded_file_path = torchtrt_cache_trtllm_whl + + if not torchtrt_cache_trtllm_whl.exists(): + # Downloading TRT-LLM lib + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + download_url = base_url + file_name + print("Downloading TRT-LLM wheel") try: logger.debug(f"Downloading {download_url} ...") - urllib.request.urlretrieve(download_url, file_name) + urllib.request.urlretrieve(download_url, downloaded_file_path) logger.debug("Download succeeded and TRT-LLM wheel is now present") except urllib.error.HTTPError as e: logger.error( @@ -880,60 +932,53 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: except OSError as e: logger.error(f"Local file write error: {e}") - # Proceeding with the unzip of the wheel file - # This will exist if the filename was already downloaded + # Proceeding with the unzip of the wheel file in tmpdir if "linux" in platform: lib_filename = "libnvinfer_plugin_tensorrt_llm.so" else: lib_filename = "libnvinfer_plugin_tensorrt_llm.dll" - plugin_lib_path = os.path.join("./tensorrt_llm/libs", lib_filename) - if os.path.exists(plugin_lib_path): - return plugin_lib_path - try: - import zipfile - except ImportError as e: - raise ImportError( - "zipfile module is required but not found. Please install zipfile" - ) - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = "./tensorrt_llm/libs/" + lib_filename - return plugin_lib_path - -def load_tensorrt_llm() -> bool: + with tempfile.TemporaryDirectory() as tmpdir: + try: + import zipfile + except ImportError: + raise ImportError( + "zipfile module is required but not found. 
Please install zipfile" + ) + try: + with zipfile.ZipFile(downloaded_file_path, "r") as zip_ref: + zip_ref.extractall(tmpdir) # Extract to a folder named 'tensorrt_llm' + except FileNotFoundError as e: + # This should capture the errors in the download failure above + logger.error(f"Wheel file not found at {downloaded_file_path}: {e}") + raise RuntimeError( + f"Failed to find downloaded wheel file at {downloaded_file_path}" + ) from e + except zipfile.BadZipFile as e: + logger.error(f"Invalid or corrupted wheel file: {e}") + raise RuntimeError( + "Downloaded wheel file is corrupted or not a valid zip archive" + ) from e + except Exception as e: + logger.error(f"Unexpected error while extracting wheel: {e}") + raise RuntimeError( + "Unexpected error during extraction of TensorRT-LLM wheel" + ) from e + plugin_lib_path = os.path.join(tmpdir, "tensorrt_llm/libs", lib_filename) + yield plugin_lib_path + + +def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: """ - Attempts to load the TensorRT-LLM plugin and initialize it. - Either the env variable TRTLLM_PLUGINS_PATH can specify the path - Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + Loads and initializes the TensorRT-LLM plugin from the given shared library path. + + Args: + plugin_lib_path (str): Path to the shared TensorRT-LLM plugin library. Returns: - bool: True if the plugin was successfully loaded and initialized, False otherwise. + bool: True if successful, False otherwise. """ - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - if not plugin_lib_path: - # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user - use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( - "1", - "true", - "yes", - "on", - ) - if not use_trtllm_plugin: - logger.warning( - "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" - ) - return False - else: - # this is used as the default py version - py_version = "cp310" - platform = Platform.current_platform() - - platform = str(platform).lower() - plugin_lib_path = download_plugin_lib_path(py_version, platform) - try: - # Load the shared TRT-LLM file handle = ctypes.CDLL(plugin_lib_path) logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") except OSError as e_os_error: @@ -946,14 +991,13 @@ def load_tensorrt_llm() -> bool: ) else: logger.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. 
" + f"Ensure the path is correct and the library is compatible.", exc_info=e_os_error, ) return False try: - # Configure plugin initialization arguments handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] handle.initTrtLlmPlugins.restype = ctypes.c_bool except AttributeError as e_plugin_unavailable: @@ -964,9 +1008,7 @@ def load_tensorrt_llm() -> bool: return False try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): + if handle.initTrtLlmPlugins(None, b"tensorrt_llm"): logger.info("TensorRT-LLM plugin successfully initialized") return True else: @@ -979,3 +1021,37 @@ def load_tensorrt_llm() -> bool: ) return False return False + + +def load_tensorrt_llm() -> bool: + """ + Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + + Returns: + bool: True if the plugin was successfully loaded and initialized, False otherwise. + """ + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if plugin_lib_path: + return load_and_initialize_trtllm_plugin(plugin_lib_path) + else: + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: + logger.warning( + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" + ) + return False + else: + platform = Platform.current_platform() + platform = str(platform).lower() + + with download_plugin_lib_path(platform) as plugin_lib_path: + return load_and_initialize_trtllm_plugin(plugin_lib_path) + return False diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 89c94300b7..a71fd1edc4 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -70,6 +70,7 @@ def forward(self, x): use_dynamo_tracer=True, enable_passes=True, ) + dist.destroy_process_group() if __name__ == "__main__": From 0909e68905e512487d66916e33cc2e7ab4419dac Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 1 Jul 2025 11:13:08 -0700 Subject: [PATCH 05/15] checks for windows where NCCL backend is not supported --- py/torch_tensorrt/dynamo/utils.py | 28 ++++++-------------- tests/py/dynamo/distributed/test_nccl_ops.py | 6 +++++ 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index c0ddcc8bc5..b1fae05d5b 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -286,19 +286,6 @@ def set_log_level(parent_logger: Any, level: Any) -> None: """ if parent_logger: parent_logger.setLevel(level) - print("Handlers for parent_logger:", parent_logger.handlers) - print("bool check--", parent_logger.hasHandlers()) - if parent_logger.hasHandlers(): - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) # Allow debug messages on handler - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) - ch.setFormatter(formatter) - parent_logger.addHandler(ch) - print("Logger level:", parent_logger.level) - # print("Parent logger level:", logger.parent.level) - print("Root logger 
level:", logging.getLogger().level) if ENABLED_FEATURES.torch_tensorrt_runtime: if level == logging.DEBUG: @@ -916,7 +903,6 @@ def download_plugin_lib_path(platform: str) -> Iterator[str]: # Downloading TRT-LLM lib base_url = "https://pypi.nvidia.com/tensorrt-llm/" download_url = base_url + file_name - print("Downloading TRT-LLM wheel") try: logger.debug(f"Downloading {download_url} ...") urllib.request.urlretrieve(download_url, downloaded_file_path) @@ -968,7 +954,7 @@ def download_plugin_lib_path(platform: str) -> Iterator[str]: yield plugin_lib_path -def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: +def load_and_initialize_trtllm_plugin(plugin_lib_path: str, platform: str) -> bool: """ Loads and initializes the TensorRT-LLM plugin from the given shared library path. @@ -978,6 +964,9 @@ def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: Returns: bool: True if successful, False otherwise. """ + if "windows" in platform: + logger.info("NCCL backend is not supported on Windows") + return False try: handle = ctypes.CDLL(plugin_lib_path) logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") @@ -1033,8 +1022,10 @@ def load_tensorrt_llm() -> bool: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + platform = Platform.current_platform() + platform = str(platform).lower() if plugin_lib_path: - return load_and_initialize_trtllm_plugin(plugin_lib_path) + return load_and_initialize_trtllm_plugin(plugin_lib_path, platform) else: # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( @@ -1048,10 +1039,7 @@ def load_tensorrt_llm() -> bool: "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" ) return False - else: - platform = Platform.current_platform() - platform = str(platform).lower() with download_plugin_lib_path(platform) as plugin_lib_path: - return load_and_initialize_trtllm_plugin(plugin_lib_path) + return load_and_initialize_trtllm_plugin(plugin_lib_path, platform) return False diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index a71fd1edc4..abde5d8b76 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -6,6 +6,7 @@ from distributed_utils import set_environment_variables_pytest from parameterized import parameterized from torch.testing._internal.common_utils import run_tests +from torch_tensorrt._enums import Platform set_environment_variables_pytest() dist.init_process_group(backend="nccl", init_method="env://") @@ -15,7 +16,12 @@ from conversion.harness import DispatchTestCase +platform_str = str(Platform.current_platform()).lower() + +@unittest.skipIf( + "win" in platform_str, "Skipped on Windows: NCCL backend is not supported." 
+) class TestGatherNcclOpsConverter(DispatchTestCase): @parameterized.expand([8]) def test_nccl_ops(self, linear_layer_dim): From fc40865e64b11f1f3c6723225a9716ce2ed8ec0f Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 1 Jul 2025 15:20:56 -0700 Subject: [PATCH 06/15] adding checks for windows and jetson devices --- .../conversion/custom_ops_converters.py | 8 +--- py/torch_tensorrt/dynamo/utils.py | 41 +++++++++++++++---- tests/py/dynamo/distributed/test_nccl_ops.py | 14 +++++-- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index d7c89fd2b7..aecc99b1f1 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -15,15 +15,11 @@ tensorrt_fused_nccl_all_gather_op, tensorrt_fused_nccl_reduce_scatter_op, ) -from torch_tensorrt.dynamo.utils import load_tensorrt_llm +from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl _LOGGER: logging.Logger = logging.getLogger(__name__) -if load_tensorrt_llm(): - from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - tensorrt_fused_nccl_all_gather_op, - tensorrt_fused_nccl_reduce_scatter_op, - ) +if load_tensorrt_llm_for_nccl(): @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) def fused_nccl_gather( diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index b1fae05d5b..1eeefc129c 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -872,6 +872,29 @@ def is_tegra_platform() -> bool: return False +def is_platform_supported_for_trtllm(platform: str) -> bool: + """ + Checks if the current platform supports TensorRT-LLM plugins for NCCL backend + Returns: + bool: True if the platform supports TensorRT-LLM plugins for NCCL backend, False otherwise. + Note: + TensorRT-LLM plugins for NCCL backend are not supported on: + - Windows platforms + - Jetson devices (aarch64 architecture) + """ + if "windows" in platform: + logger.info( + "TensorRT-LLM plugins for NCCL backend are not supported on Windows" + ) + return False + if "aarch64" in platform: + logger.info( + "TensorRT-LLM plugins for NCCL backend are not supported on Jetson devices (aarch64)" + ) + return False + return True + + @contextmanager def download_plugin_lib_path(platform: str) -> Iterator[str]: """ @@ -922,6 +945,7 @@ def download_plugin_lib_path(platform: str) -> Iterator[str]: if "linux" in platform: lib_filename = "libnvinfer_plugin_tensorrt_llm.so" else: + # This condition is never met though lib_filename = "libnvinfer_plugin_tensorrt_llm.dll" with tempfile.TemporaryDirectory() as tmpdir: @@ -954,7 +978,7 @@ def download_plugin_lib_path(platform: str) -> Iterator[str]: yield plugin_lib_path -def load_and_initialize_trtllm_plugin(plugin_lib_path: str, platform: str) -> bool: +def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: """ Loads and initializes the TensorRT-LLM plugin from the given shared library path. @@ -964,9 +988,6 @@ def load_and_initialize_trtllm_plugin(plugin_lib_path: str, platform: str) -> bo Returns: bool: True if successful, False otherwise. 
""" - if "windows" in platform: - logger.info("NCCL backend is not supported on Windows") - return False try: handle = ctypes.CDLL(plugin_lib_path) logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") @@ -1012,7 +1033,7 @@ def load_and_initialize_trtllm_plugin(plugin_lib_path: str, platform: str) -> bo return False -def load_tensorrt_llm() -> bool: +def load_tensorrt_llm_for_nccl() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. Either the env variable TRTLLM_PLUGINS_PATH can specify the path @@ -1021,11 +1042,15 @@ def load_tensorrt_llm() -> bool: Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + # Check platform compatibility first platform = Platform.current_platform() platform = str(platform).lower() + if not is_platform_supported_for_trtllm(platform): + return False + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if plugin_lib_path: - return load_and_initialize_trtllm_plugin(plugin_lib_path, platform) + return load_and_initialize_trtllm_plugin(plugin_lib_path) else: # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( @@ -1041,5 +1066,5 @@ def load_tensorrt_llm() -> bool: return False with download_plugin_lib_path(platform) as plugin_lib_path: - return load_and_initialize_trtllm_plugin(plugin_lib_path, platform) + return load_and_initialize_trtllm_plugin(plugin_lib_path) return False diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index abde5d8b76..9ae6a03839 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -1,4 +1,5 @@ import os +import unittest import torch import torch.distributed as dist @@ -19,12 +20,13 @@ platform_str = str(Platform.current_platform()).lower() -@unittest.skipIf( - "win" in platform_str, "Skipped on Windows: NCCL backend is not supported." 
-) class TestGatherNcclOpsConverter(DispatchTestCase): + @unittest.skipIf( + "win" or "aarch64" in platform_str, + "Skipped on Windows and Jetson: NCCL backend is not supported.", + ) @parameterized.expand([8]) - def test_nccl_ops(self, linear_layer_dim): + def test_nccl_ops_gather(self, linear_layer_dim): class DistributedGatherModel(nn.Module): def __init__(self, input_dim): super().__init__() @@ -48,6 +50,10 @@ def forward(self, x): enable_passes=True, ) + @unittest.skipIf( + "win" or "aarch64" in platform_str, + "Skipped on Windows and Jetson: NCCL backend is not supported.", + ) @parameterized.expand([8]) def test_nccl_ops_scatter(self, linear_layer_dim): From ac8fcebd0fd07329b3ac4312941985b74c74f2cf Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 3 Jul 2025 17:35:01 -0700 Subject: [PATCH 07/15] Keeping the extracted and deleting download file, restructuring test --- py/torch_tensorrt/dynamo/utils.py | 142 ++++++++++-------- .../dynamo/distributed/distributed_utils.py | 1 - tests/py/dynamo/distributed/test_nccl_ops.py | 103 +++++++------ tests/py/dynamo/distributed/test_nccl_ops.sh | 47 +----- 4 files changed, 135 insertions(+), 158 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 1eeefc129c..255d803b94 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -8,7 +8,6 @@ import tempfile import urllib.request import warnings -from contextlib import contextmanager from dataclasses import fields, replace from enum import Enum from pathlib import Path @@ -16,7 +15,6 @@ Any, Callable, Dict, - Iterator, List, Optional, Sequence, @@ -895,40 +893,52 @@ def is_platform_supported_for_trtllm(platform: str) -> bool: return True -@contextmanager -def download_plugin_lib_path(platform: str) -> Iterator[str]: - """ - Downloads (if needed) and extracts the TensorRT-LLM plugin wheel for the specified platform, - then yields the path to the extracted shared library (.so or .dll). +def _cache_root() -> Path: + username = getpass.getuser() + return Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" - The wheel file is cached in a user-specific temporary directory to avoid repeated downloads. - Extraction happens in a temporary directory that is cleaned up after use. - Args: - platform (str): The platform identifier string (e.g., 'linux_x86_64') to select the correct wheel. +def _extracted_dir_trtllm(platform: str) -> Path: + return _cache_root() / "trtllm" / f"{__tensorrt_llm_version__}_{platform}" - Yields: - str: The full path to the extracted TensorRT-LLM shared library file. - Raises: - ImportError: If the 'zipfile' module is not available. - RuntimeError: If the wheel file is missing, corrupted, or extraction fails. +def download_and_get_plugin_lib_path(platform: str) -> Optional[str]: """ - plugin_lib_path = None - username = getpass.getuser() - torchtrt_cache_dir = Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" - torchtrt_cache_dir.mkdir(parents=True, exist_ok=True) - file_name = f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-{_WHL_CPYTHON_VERSION}-{platform}.whl" - torchtrt_cache_trtllm_whl = torchtrt_cache_dir / file_name - downloaded_file_path = torchtrt_cache_trtllm_whl - - if not torchtrt_cache_trtllm_whl.exists(): - # Downloading TRT-LLM lib + Returns the path to the TensorRT‑LLM shared library, downloading and extracting if necessary. 
+ + Args: + platform (str): Platform identifier (e.g., 'linux_x86_64') + + Returns: + Optional[str]: Path to shared library or None if operation fails. + """ + wheel_filename = ( + f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-" + f"{_WHL_CPYTHON_VERSION}-{platform}.whl" + ) + wheel_path = _cache_root() / wheel_filename + extract_dir = _extracted_dir_trtllm(platform) + # else will never be met though + lib_filename = ( + "libnvinfer_plugin_tensorrt_llm.so" + if "linux" in platform + else "libnvinfer_plugin_tensorrt_llm.dll" + ) + # eg: /tmp/torch_tensorrt_/trtllm/0.17.0.post1_linux_x86_64/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so + plugin_lib_path = extract_dir / "tensorrt_llm" / "libs" / lib_filename + + if plugin_lib_path.exists(): + return str(plugin_lib_path) + + wheel_path.parent.mkdir(parents=True, exist_ok=True) + extract_dir.mkdir(parents=True, exist_ok=True) + + if not wheel_path.exists(): base_url = "https://pypi.nvidia.com/tensorrt-llm/" - download_url = base_url + file_name + download_url = base_url + wheel_filename try: logger.debug(f"Downloading {download_url} ...") - urllib.request.urlretrieve(download_url, downloaded_file_path) + urllib.request.urlretrieve(download_url, wheel_path) logger.debug("Download succeeded and TRT-LLM wheel is now present") except urllib.error.HTTPError as e: logger.error( @@ -941,41 +951,45 @@ def download_plugin_lib_path(platform: str) -> Iterator[str]: except OSError as e: logger.error(f"Local file write error: {e}") - # Proceeding with the unzip of the wheel file in tmpdir - if "linux" in platform: - lib_filename = "libnvinfer_plugin_tensorrt_llm.so" - else: - # This condition is never met though - lib_filename = "libnvinfer_plugin_tensorrt_llm.dll" + try: + import zipfile + except ImportError as e: + raise ImportError( + "zipfile module is required but not found. Please install zipfile" + ) + try: + with zipfile.ZipFile(wheel_path) as zip_ref: + zip_ref.extractall(extract_dir) + logger.debug(f"Extracted wheel to {extract_dir}") + except FileNotFoundError as e: + # This should capture the errors in the download failure above + logger.error(f"Wheel file not found at {wheel_path}: {e}") + raise RuntimeError( + f"Failed to find downloaded wheel file at {wheel_path}" + ) from e + except zipfile.BadZipFile as e: + logger.error(f"Invalid or corrupted wheel file: {e}") + raise RuntimeError( + "Downloaded wheel file is corrupted or not a valid zip archive" + ) from e + except Exception as e: + logger.error(f"Unexpected error while extracting wheel: {e}") + raise RuntimeError( + "Unexpected error during extraction of TensorRT-LLM wheel" + ) from e - with tempfile.TemporaryDirectory() as tmpdir: - try: - import zipfile - except ImportError: - raise ImportError( - "zipfile module is required but not found. 
Please install zipfile" - ) - try: - with zipfile.ZipFile(downloaded_file_path, "r") as zip_ref: - zip_ref.extractall(tmpdir) # Extract to a folder named 'tensorrt_llm' - except FileNotFoundError as e: - # This should capture the errors in the download failure above - logger.error(f"Wheel file not found at {downloaded_file_path}: {e}") - raise RuntimeError( - f"Failed to find downloaded wheel file at {downloaded_file_path}" - ) from e - except zipfile.BadZipFile as e: - logger.error(f"Invalid or corrupted wheel file: {e}") - raise RuntimeError( - "Downloaded wheel file is corrupted or not a valid zip archive" - ) from e - except Exception as e: - logger.error(f"Unexpected error while extracting wheel: {e}") - raise RuntimeError( - "Unexpected error during extraction of TensorRT-LLM wheel" - ) from e - plugin_lib_path = os.path.join(tmpdir, "tensorrt_llm/libs", lib_filename) - yield plugin_lib_path + try: + wheel_path.unlink(missing_ok=True) + logger.debug(f"Deleted wheel file: {wheel_path}") + except Exception as e: + logger.warning(f"Could not delete wheel file {wheel_path}: {e}") + if not plugin_lib_path.exists(): + logger.error( + f"Plugin library not found at expected location: {plugin_lib_path}" + ) + return None + + return str(plugin_lib_path) def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: @@ -1065,6 +1079,6 @@ def load_tensorrt_llm_for_nccl() -> bool: ) return False - with download_plugin_lib_path(platform) as plugin_lib_path: - return load_and_initialize_trtllm_plugin(plugin_lib_path) + plugin_lib_path = download_and_get_plugin_lib_path(platform) + return load_and_initialize_trtllm_plugin(plugin_lib_path) # type: ignore[arg-type] return False diff --git a/tests/py/dynamo/distributed/distributed_utils.py b/tests/py/dynamo/distributed/distributed_utils.py index e3062249fa..bc058aaaec 100644 --- a/tests/py/dynamo/distributed/distributed_utils.py +++ b/tests/py/dynamo/distributed/distributed_utils.py @@ -13,7 +13,6 @@ def set_environment_variables_pytest(): os.environ["RANK"] = str(0) os.environ["MASTER_ADDR"] = "127.0.0.1" os.environ["MASTER_PORT"] = str(29500) - os.environ["USE_TRTLLM_PLUGINS"] = "1" def initialize_logger(rank, logger_file_name): diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 9ae6a03839..91bcc56f44 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -4,18 +4,42 @@ import torch import torch.distributed as dist import torch.nn as nn +from conversion.harness import DispatchTestCase from distributed_utils import set_environment_variables_pytest from parameterized import parameterized from torch.testing._internal.common_utils import run_tests from torch_tensorrt._enums import Platform -set_environment_variables_pytest() -dist.init_process_group(backend="nccl", init_method="env://") -group = dist.new_group(ranks=[0]) -group_name = group.group_name -world_size = 1 -from conversion.harness import DispatchTestCase +class DistributedGatherModel(nn.Module): + def __init__(self, input_dim, world_size, group_name): + super().__init__() + self.fc = nn.Linear(input_dim, input_dim) + self.world_size = world_size + self.group_name = group_name + + def forward(self, x): + x = self.fc(x) + gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( + x, self.world_size, self.group_name + ) + return torch.ops._c10d_functional.wait_tensor(gathered_tensor) + + +class DistributedReduceScatterModel(nn.Module): + def __init__(self, 
input_dim, world_size, group_name): + super().__init__() + self.fc = nn.Linear(input_dim, input_dim) + self.world_size = world_size + self.group_name = group_name + + def forward(self, x): + x = self.fc(x) + out = torch.ops._c10d_functional.reduce_scatter_tensor( + x, "sum", self.world_size, self.group_name + ) + return torch.ops._c10d_functional.wait_tensor(out) + platform_str = str(Platform.current_platform()).lower() @@ -25,64 +49,49 @@ class TestGatherNcclOpsConverter(DispatchTestCase): "win" or "aarch64" in platform_str, "Skipped on Windows and Jetson: NCCL backend is not supported.", ) + @classmethod + def setUpClass(cls): + set_environment_variables_pytest() + print("USE_TRTLLM_PLUGINS =", os.environ.get("USE_TRTLLM_PLUGINS")) + cls.world_size = 1 + if not dist.is_initialized(): + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=cls.world_size, + rank=0, # or read from env + ) + cls.group = dist.new_group(ranks=[0]) + cls.group_name = cls.group.group_name + + @classmethod + def tearDownClass(cls): + if dist.is_initialized(): + dist.destroy_process_group() + @parameterized.expand([8]) def test_nccl_ops_gather(self, linear_layer_dim): - class DistributedGatherModel(nn.Module): - def __init__(self, input_dim): - super().__init__() - self.fc = torch.nn.Linear(input_dim, input_dim) - - def forward(self, x): - x = self.fc(x) - gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( - x, world_size, group_name - ) - gathered_tensor = torch.ops._c10d_functional.wait_tensor( - gathered_tensor - ) - return gathered_tensor - inputs = [torch.randn(1, linear_layer_dim).to("cuda")] self.run_test( - DistributedGatherModel(linear_layer_dim).cuda(), + DistributedGatherModel( + linear_layer_dim, self.world_size, self.group_name + ).cuda(), inputs, use_dynamo_tracer=True, enable_passes=True, ) - @unittest.skipIf( - "win" or "aarch64" in platform_str, - "Skipped on Windows and Jetson: NCCL backend is not supported.", - ) @parameterized.expand([8]) def test_nccl_ops_scatter(self, linear_layer_dim): - - class DistributedReduceScatterModel(nn.Module): - def __init__(self, input_dim): - super().__init__() - self.fc = torch.nn.Linear(input_dim, input_dim) - - def forward(self, x): - x = self.fc(x) - scatter_reduce_tensor = ( - torch.ops._c10d_functional.reduce_scatter_tensor( - x, "sum", world_size, group_name - ) - ) - scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor( - scatter_reduce_tensor - ) - return scatter_reduce_tensor - inputs = [torch.zeros(1, linear_layer_dim).to("cuda")] - self.run_test( - DistributedReduceScatterModel(linear_layer_dim).cuda(), + DistributedReduceScatterModel( + linear_layer_dim, self.world_size, self.group_name + ).cuda(), inputs, use_dynamo_tracer=True, enable_passes=True, ) - dist.destroy_process_group() if __name__ == "__main__": diff --git a/tests/py/dynamo/distributed/test_nccl_ops.sh b/tests/py/dynamo/distributed/test_nccl_ops.sh index dd54700048..677d0cb9bc 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.sh +++ b/tests/py/dynamo/distributed/test_nccl_ops.sh @@ -70,51 +70,6 @@ ensure_pytest_installed(){ echo "Setting up the environment" -OS="$(uname -s)" -ARCH="$(uname -m)" - - -#getting the file name for TensorRT-LLM download -if [[ "$OS" == "Linux" && "$ARCH" == "x86_64"]]; then - FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_x86_64.whl" -elif [[ "$OS" == "Linux" && "$ARCH" == "aarch64"]]; then - FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_aarch64.whl" -else: - echo "Unsupported platform: OS=$OS 
ARCH=$ARCH - exit 1 -fi - -# Download the selected file -URL="https://pypi.nvidia.com/tensorrt-llm/$FILE" -echo "Downloading $FILE from $URL..." - -#Installing wget -ensure_installed wget - -#Downloading the file -filename=$(basename "$URL") -if [ -f "$filename" ]; then - echo "File already exists: $filename" -else - wget "$URL" -fi -echo "Download complete: $FILE" - -UNZIP_DIR="tensorrt_llm_unzip" -if [[ ! -d "$UNZIP_DIR" ]]; then - echo "Creating directory: $UNZIP_DIR" - mkdir -p "$UNZIP_DIR" - echo "extracting $FILE to $UNZIP_DIR ..." - #Installing unzip - ensure_installed unzip - #unzip the TensorRT-LLM package - unzip -q "$FILE" -d "$UNZIP_DIR" - echo "Unzip complete" -fi - - -export TRTLLM_PLUGINS_PATH="$(pwd)/${UNZIP_DIR}/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so" -echo ${TRTLLM_PLUGINS_PATH} ensure_mpi_installed libmpich-dev ensure_mpi_installed libopenmpi-dev @@ -123,7 +78,7 @@ run_tests() { cd .. export PYTHONPATH=$(pwd) echo "Running pytest on distributed/test_nccl_ops.py..." - pytest distributed/test_nccl_ops.py + USE_TRTLLM_PLUGINS=1 pytest distributed/test_nccl_ops.py } run_mpi_tests(){ From 34ba5777c5a27e270ca515cb7eb120324ea0d64a Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 7 Jul 2025 11:46:36 -0700 Subject: [PATCH 08/15] modifying the error warning of missing libmpi libs --- py/torch_tensorrt/dynamo/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 255d803b94..0173187741 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -1008,9 +1008,7 @@ def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool: except OSError as e_os_error: if "libmpi" in str(e_os_error): logger.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " - f"The dependency libmpi.so is missing. 
" - f"Please install the packages libmpich-dev and libopenmpi-dev.", + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}, got error {e_os_error} (hint: libmpi.so is a necessary dependency; ensure that OpenMPI or MPICH is installed on your system)", exc_info=e_os_error, ) else: From 85b4fbb6948cab128fe7a134de89db86025bcf40 Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 7 Jul 2025 11:54:51 -0700 Subject: [PATCH 09/15] removing the redundant initializations --- tests/py/dynamo/distributed/test_nccl_ops.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 91bcc56f44..e8bca66efe 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -55,12 +55,7 @@ def setUpClass(cls): print("USE_TRTLLM_PLUGINS =", os.environ.get("USE_TRTLLM_PLUGINS")) cls.world_size = 1 if not dist.is_initialized(): - dist.init_process_group( - backend="nccl", - init_method="env://", - world_size=cls.world_size, - rank=0, # or read from env - ) + dist.init_process_group(backend="nccl") cls.group = dist.new_group(ranks=[0]) cls.group_name = cls.group.group_name From 95737c06d52d564839123a09c7446d0360e4635a Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Jul 2025 12:29:45 -0700 Subject: [PATCH 10/15] adding tests in CI --- .github/workflows/build-test-linux-x86_64.yml | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index 51f3730d02..dc67cba06a 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -337,6 +337,37 @@ jobs: python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . popd + tests-py-distributed: + name: Test dynamo distributed [Python] + needs: [filter-matrix, build] + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/tensorrt + package-name: torch_tensorrt + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: packaging/smoke_test_script.sh + uses: ./.github/workflows/linux-test.yml + with: + job-name: tests-py-dynamo-distributed + repository: "pytorch/tensorrt" + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.filter-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + script: | + set -euo pipefail + export USE_HOST_DEPS=1 + export CI_BUILD=1 + pushd . 
+ cd tests/py + cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py + popd + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} cancel-in-progress: true From 2aa94ca1a0b329b9c6fb1f5ebdc14aa7143a619f Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 21 Jul 2025 23:15:14 -0700 Subject: [PATCH 11/15] correcting the skip test condition --- .github/workflows/build-test-linux-x86_64.yml | 1 + tests/py/dynamo/distributed/test_nccl_ops.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index dc67cba06a..122b929f50 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -362,6 +362,7 @@ jobs: set -euo pipefail export USE_HOST_DEPS=1 export CI_BUILD=1 + export USE_TRTLLM_PLUGINS=1 pushd . cd tests/py cd dynamo diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index e8bca66efe..c48a2d17d2 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -46,7 +46,7 @@ def forward(self, x): class TestGatherNcclOpsConverter(DispatchTestCase): @unittest.skipIf( - "win" or "aarch64" in platform_str, + "win" in platform_str or "aarch64" in platform_str, "Skipped on Windows and Jetson: NCCL backend is not supported.", ) @classmethod From 74e3ca472feac608f6354b8c499e19022075d4e5 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 24 Jul 2025 10:41:53 -0700 Subject: [PATCH 12/15] install MPI libs for linux x86 --- .github/scripts/install-mpi-linux-x86.sh | 4 ++++ .github/scripts/install-torch-tensorrt.sh | 6 ++++++ 2 files changed, 10 insertions(+) create mode 100644 .github/scripts/install-mpi-linux-x86.sh diff --git a/.github/scripts/install-mpi-linux-x86.sh b/.github/scripts/install-mpi-linux-x86.sh new file mode 100644 index 0000000000..6c78f98551 --- /dev/null +++ b/.github/scripts/install-mpi-linux-x86.sh @@ -0,0 +1,4 @@ +install_mpi_linux_x86() { + echo "install mpi for x86" + dnf install -y mpich mpich-devel openmpi openmpi-devel +} \ No newline at end of file diff --git a/.github/scripts/install-torch-tensorrt.sh b/.github/scripts/install-torch-tensorrt.sh index 94de5f022a..115931e755 100755 --- a/.github/scripts/install-torch-tensorrt.sh +++ b/.github/scripts/install-torch-tensorrt.sh @@ -12,6 +12,12 @@ if [[ $(uname -m) == "aarch64" ]]; then install_cuda_aarch64 fi +if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then + # install MPI for Linux x86_64 + source .github/scripts/install-mpi-linux-x86.sh + install_mpi_linux_x86 +fi + # Install all the dependencies required for Torch-TensorRT pip install --pre -r ${PWD}/tests/py/requirements.txt # dependencies in the tests/py/requirements.txt might install a different version of torch or torchvision From 26e244fb78144bf54b554714d05f15ec5d70398a Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 31 Jul 2025 14:17:14 -0700 Subject: [PATCH 13/15] adding SBSA to the supported platform of TRT-LLM libs and installing MPI libs for the distributed tests --- .github/scripts/install-mpi-linux-x86.sh | 4 ---- .github/scripts/install-torch-tensorrt.sh | 6 ------ .github/workflows/build-test-linux-x86_64.yml | 1 + py/torch_tensorrt/dynamo/utils.py | 11 
+++++++---- tests/py/dynamo/distributed/test_nccl_ops.py | 7 ++++--- 5 files changed, 12 insertions(+), 17 deletions(-) delete mode 100644 .github/scripts/install-mpi-linux-x86.sh diff --git a/.github/scripts/install-mpi-linux-x86.sh b/.github/scripts/install-mpi-linux-x86.sh deleted file mode 100644 index 6c78f98551..0000000000 --- a/.github/scripts/install-mpi-linux-x86.sh +++ /dev/null @@ -1,4 +0,0 @@ -install_mpi_linux_x86() { - echo "install mpi for x86" - dnf install -y mpich mpich-devel openmpi openmpi-devel -} \ No newline at end of file diff --git a/.github/scripts/install-torch-tensorrt.sh b/.github/scripts/install-torch-tensorrt.sh index 115931e755..94de5f022a 100755 --- a/.github/scripts/install-torch-tensorrt.sh +++ b/.github/scripts/install-torch-tensorrt.sh @@ -12,12 +12,6 @@ if [[ $(uname -m) == "aarch64" ]]; then install_cuda_aarch64 fi -if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then - # install MPI for Linux x86_64 - source .github/scripts/install-mpi-linux-x86.sh - install_mpi_linux_x86 -fi - # Install all the dependencies required for Torch-TensorRT pip install --pre -r ${PWD}/tests/py/requirements.txt # dependencies in the tests/py/requirements.txt might install a different version of torch or torchvision diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index 122b929f50..ccf3de5e4a 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -363,6 +363,7 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 export USE_TRTLLM_PLUGINS=1 + dnf install -y mpich mpich-devel openmpi openmpi-devel pushd . cd tests/py cd dynamo diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 0173187741..0b7aa33ac8 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -878,18 +878,21 @@ def is_platform_supported_for_trtllm(platform: str) -> bool: Note: TensorRT-LLM plugins for NCCL backend are not supported on: - Windows platforms - - Jetson devices (aarch64 architecture) + - Orin, Xavier, or Tegra devices (aarch64 architecture) + """ if "windows" in platform: logger.info( "TensorRT-LLM plugins for NCCL backend are not supported on Windows" ) return False - if "aarch64" in platform: + if torch.cuda.is_available(): + device_name = torch.cuda.get_device_name().lower() + if any(keyword in device_name for keyword in ["orin", "xavier", "tegra"]): + return False logger.info( - "TensorRT-LLM plugins for NCCL backend are not supported on Jetson devices (aarch64)" + "TensorRT-LLM plugins for NCCL backend are not supported on Jetson devices" ) - return False return True diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index c48a2d17d2..40c728b1b9 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -9,6 +9,7 @@ from parameterized import parameterized from torch.testing._internal.common_utils import run_tests from torch_tensorrt._enums import Platform +from torch_tensorrt.dynamo.utils import is_platform_supported_for_trtllm class DistributedGatherModel(nn.Module): @@ -44,10 +45,10 @@ def forward(self, x): platform_str = str(Platform.current_platform()).lower() -class TestGatherNcclOpsConverter(DispatchTestCase): +class TestNcclOpsConverter(DispatchTestCase): @unittest.skipIf( - "win" in platform_str or "aarch64" in platform_str, - "Skipped on Windows and Jetson: NCCL backend is not 
supported.", + not is_platform_supported_for_trtllm(platform_str), + "Skipped on Windows, Jetson: NCCL backend is not supported.", ) @classmethod def setUpClass(cls): From 7bb105d8ce916539c13c690c077687c2117cfd0a Mon Sep 17 00:00:00 2001 From: apbose Date: Fri, 8 Aug 2025 08:45:19 -0700 Subject: [PATCH 14/15] Using python package for platform detection --- .../workflows/build-test-linux-aarch64.yml | 35 ++++++++ py/torch_tensorrt/dynamo/utils.py | 80 ++++++------------- tests/py/dynamo/distributed/test_nccl_ops.py | 6 +- 3 files changed, 62 insertions(+), 59 deletions(-) diff --git a/.github/workflows/build-test-linux-aarch64.yml b/.github/workflows/build-test-linux-aarch64.yml index 765e3fc2c4..66963a098e 100644 --- a/.github/workflows/build-test-linux-aarch64.yml +++ b/.github/workflows/build-test-linux-aarch64.yml @@ -356,6 +356,41 @@ jobs: python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . popd + tests-py-distributed: + name: Test dynamo distributed [Python] + needs: [filter-matrix, build] + if: false + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/tensorrt + package-name: torch_tensorrt + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: packaging/smoke_test_script.sh + uses: ./.github/workflows/linux-test.yml + with: + job-name: tests-py-dynamo-distributed + repository: "pytorch/tensorrt" + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.filter-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + script: | + set -euo pipefail + export USE_HOST_DEPS=1 + export CI_BUILD=1 + export USE_TRTLLM_PLUGINS=1 + dnf install -y mpich mpich-devel openmpi openmpi-devel + pushd . + cd tests/py + cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py + popd + + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} cancel-in-progress: true \ No newline at end of file diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 0b7aa33ac8..daad7a854b 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -5,6 +5,7 @@ import getpass import logging import os +import platform import tempfile import urllib.request import warnings @@ -29,7 +30,7 @@ from torch._subclasses.fake_tensor import FakeTensor from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._Device import Device -from torch_tensorrt._enums import Platform, dtype +from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input from torch_tensorrt._version import __tensorrt_llm_version__ @@ -101,37 +102,6 @@ class Frameworks(Enum): } -def unified_dtype_converter( - dtype: Union[TRTDataType, torch.dtype, np.dtype], to: Frameworks -) -> Union[np.dtype, torch.dtype, TRTDataType]: - """ - Convert TensorRT, Numpy, or Torch data types to any other of those data types. - - Args: - dtype (TRTDataType, torch.dtype, np.dtype): A TensorRT, Numpy, or Torch data type. - to (Frameworks): The framework to convert the data type to. - - Returns: - The equivalent data type in the requested framework. 
- """ - assert to in Frameworks, f"Expected valid Framework for translation, got {to}" - trt_major_version = int(trt.__version__.split(".")[0]) - if dtype in (np.int8, torch.int8, trt.int8): - return DataTypeEquivalence[trt.int8][to] - elif trt_major_version >= 7 and dtype in (np.bool_, torch.bool, trt.bool): - return DataTypeEquivalence[trt.bool][to] - elif dtype in (np.int32, torch.int32, trt.int32): - return DataTypeEquivalence[trt.int32][to] - elif dtype in (np.int64, torch.int64, trt.int64): - return DataTypeEquivalence[trt.int64][to] - elif dtype in (np.float16, torch.float16, trt.float16): - return DataTypeEquivalence[trt.float16][to] - elif dtype in (np.float32, torch.float32, trt.float32): - return DataTypeEquivalence[trt.float32][to] - else: - raise TypeError("%s is not a supported dtype" % dtype) - - def deallocate_module(module: torch.fx.GraphModule, delete_module: bool = True) -> None: """ This is a helper function to delete the instance of module. We first move it to CPU and then @@ -870,29 +840,33 @@ def is_tegra_platform() -> bool: return False -def is_platform_supported_for_trtllm(platform: str) -> bool: +def is_platform_supported_for_trtllm() -> bool: """ - Checks if the current platform supports TensorRT-LLM plugins for NCCL backend + Checks if the current platform supports TensorRT-LLM plugins for the NCCL backend. + Returns: - bool: True if the platform supports TensorRT-LLM plugins for NCCL backend, False otherwise. - Note: - TensorRT-LLM plugins for NCCL backend are not supported on: - - Windows platforms - - Orin, Xavier, or Tegra devices (aarch64 architecture) + bool: True if supported, False otherwise. + Unsupported: + - Windows platforms + - Jetson/Orin/Xavier (aarch64 architecture + 'tegra' in platform release) """ - if "windows" in platform: + system = platform.system().lower() + machine = platform.machine().lower() + release = platform.release().lower() + + if "windows" in system: logger.info( - "TensorRT-LLM plugins for NCCL backend are not supported on Windows" + "TensorRT-LLM plugins for NCCL backend are not supported on Windows." ) return False - if torch.cuda.is_available(): - device_name = torch.cuda.get_device_name().lower() - if any(keyword in device_name for keyword in ["orin", "xavier", "tegra"]): - return False + + if machine == "aarch64" and "tegra" in release: logger.info( - "TensorRT-LLM plugins for NCCL backend are not supported on Jetson devices" + "TensorRT-LLM plugins for NCCL backend are not supported on Jetson/Orin/Xavier (Tegra) devices." ) + return False + return True @@ -905,7 +879,7 @@ def _extracted_dir_trtllm(platform: str) -> Path: return _cache_root() / "trtllm" / f"{__tensorrt_llm_version__}_{platform}" -def download_and_get_plugin_lib_path(platform: str) -> Optional[str]: +def download_and_get_plugin_lib_path() -> Optional[str]: """ Returns the path to the TensorRT‑LLM shared library, downloading and extracting if necessary. 
@@ -919,12 +893,13 @@ def download_and_get_plugin_lib_path(platform: str) -> Optional[str]: f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-" f"{_WHL_CPYTHON_VERSION}-{platform}.whl" ) + platform_system = platform.system().lower() wheel_path = _cache_root() / wheel_filename - extract_dir = _extracted_dir_trtllm(platform) + extract_dir = _extracted_dir_trtllm(platform_system) # else will never be met though lib_filename = ( "libnvinfer_plugin_tensorrt_llm.so" - if "linux" in platform + if "linux" in platform_system else "libnvinfer_plugin_tensorrt_llm.dll" ) # eg: /tmp/torch_tensorrt_/trtllm/0.17.0.post1_linux_x86_64/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so @@ -1057,10 +1032,7 @@ def load_tensorrt_llm_for_nccl() -> bool: Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ - # Check platform compatibility first - platform = Platform.current_platform() - platform = str(platform).lower() - if not is_platform_supported_for_trtllm(platform): + if not is_platform_supported_for_trtllm(): return False plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") @@ -1080,6 +1052,6 @@ def load_tensorrt_llm_for_nccl() -> bool: ) return False - plugin_lib_path = download_and_get_plugin_lib_path(platform) + plugin_lib_path = download_and_get_plugin_lib_path() return load_and_initialize_trtllm_plugin(plugin_lib_path) # type: ignore[arg-type] return False diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 40c728b1b9..4f7997b242 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -8,7 +8,6 @@ from distributed_utils import set_environment_variables_pytest from parameterized import parameterized from torch.testing._internal.common_utils import run_tests -from torch_tensorrt._enums import Platform from torch_tensorrt.dynamo.utils import is_platform_supported_for_trtllm @@ -42,12 +41,9 @@ def forward(self, x): return torch.ops._c10d_functional.wait_tensor(out) -platform_str = str(Platform.current_platform()).lower() - - class TestNcclOpsConverter(DispatchTestCase): @unittest.skipIf( - not is_platform_supported_for_trtllm(platform_str), + not is_platform_supported_for_trtllm(), "Skipped on Windows, Jetson: NCCL backend is not supported.", ) @classmethod From 86d39ad37c48f0032d4695feed1398f3be52bc68 Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 12 Aug 2025 17:47:07 -0700 Subject: [PATCH 15/15] using python platform --- py/torch_tensorrt/dynamo/utils.py | 44 +++++++++++++++++--- tests/py/dynamo/distributed/test_nccl_ops.py | 1 - 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index daad7a854b..39f0c1dd1a 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -102,6 +102,35 @@ class Frameworks(Enum): } +def unified_dtype_converter( + dtype: Union[TRTDataType, torch.dtype, np.dtype], to: Frameworks +) -> Union[np.dtype, torch.dtype, TRTDataType]: + """ + Convert TensorRT, Numpy, or Torch data types to any other of those data types. + Args: + dtype (TRTDataType, torch.dtype, np.dtype): A TensorRT, Numpy, or Torch data type. + to (Frameworks): The framework to convert the data type to. + Returns: + The equivalent data type in the requested framework. 
+ """ + assert to in Frameworks, f"Expected valid Framework for translation, got {to}" + trt_major_version = int(trt.__version__.split(".")[0]) + if dtype in (np.int8, torch.int8, trt.int8): + return DataTypeEquivalence[trt.int8][to] + elif trt_major_version >= 7 and dtype in (np.bool_, torch.bool, trt.bool): + return DataTypeEquivalence[trt.bool][to] + elif dtype in (np.int32, torch.int32, trt.int32): + return DataTypeEquivalence[trt.int32][to] + elif dtype in (np.int64, torch.int64, trt.int64): + return DataTypeEquivalence[trt.int64][to] + elif dtype in (np.float16, torch.float16, trt.float16): + return DataTypeEquivalence[trt.float16][to] + elif dtype in (np.float32, torch.float32, trt.float32): + return DataTypeEquivalence[trt.float32][to] + else: + raise TypeError("%s is not a supported dtype" % dtype) + + def deallocate_module(module: torch.fx.GraphModule, delete_module: bool = True) -> None: """ This is a helper function to delete the instance of module. We first move it to CPU and then @@ -875,8 +904,12 @@ def _cache_root() -> Path: return Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}" -def _extracted_dir_trtllm(platform: str) -> Path: - return _cache_root() / "trtllm" / f"{__tensorrt_llm_version__}_{platform}" +def _extracted_dir_trtllm(platform_system: str, platform_machine: str) -> Path: + return ( + _cache_root() + / "trtllm" + / f"{__tensorrt_llm_version__}_{platform_system}_{platform_machine}" + ) def download_and_get_plugin_lib_path() -> Optional[str]: @@ -889,13 +922,14 @@ def download_and_get_plugin_lib_path() -> Optional[str]: Returns: Optional[str]: Path to shared library or None if operation fails. """ + platform_system = platform.system().lower() + platform_machine = platform.machine().lower() wheel_filename = ( f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-" - f"{_WHL_CPYTHON_VERSION}-{platform}.whl" + f"{_WHL_CPYTHON_VERSION}-{platform_system}_{platform_machine}.whl" ) - platform_system = platform.system().lower() wheel_path = _cache_root() / wheel_filename - extract_dir = _extracted_dir_trtllm(platform_system) + extract_dir = _extracted_dir_trtllm(platform_system, platform_machine) # else will never be met though lib_filename = ( "libnvinfer_plugin_tensorrt_llm.so" diff --git a/tests/py/dynamo/distributed/test_nccl_ops.py b/tests/py/dynamo/distributed/test_nccl_ops.py index 4f7997b242..3043954ece 100644 --- a/tests/py/dynamo/distributed/test_nccl_ops.py +++ b/tests/py/dynamo/distributed/test_nccl_ops.py @@ -49,7 +49,6 @@ class TestNcclOpsConverter(DispatchTestCase): @classmethod def setUpClass(cls): set_environment_variables_pytest() - print("USE_TRTLLM_PLUGINS =", os.environ.get("USE_TRTLLM_PLUGINS")) cls.world_size = 1 if not dist.is_initialized(): dist.init_process_group(backend="nccl")