Commit 9bd42ec

[TRTLLM-5208][BREAKING CHANGE] chore: make pytorch LLM the default (#5312)
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
1 parent 113f6fb commit 9bd42ec

File tree

89 files changed: +320 −251 lines
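Across the 89 files, the change is largely a mechanical import rewrite: the PyTorch-backend `LLM` moves up to the package root (`tensorrt_llm.LLM`), and code that still wants the TensorRT-engine `LLM` switches to the explicit `tensorrt_llm._tensorrt_engine.LLM`. A rough sketch of the two rewrite rules as a script — the helper name and rule table are illustrative, inferred from the diffs below, not tooling from the repo:

```python
# Illustrative sketch of the two import-rewrite rules this commit applies.
# The rule table is an assumption inferred from the diffs; it deliberately
# ignores combined imports such as
# `from tensorrt_llm.llmapi import LLM, BuildConfig, ...`,
# which the commit splits into two lines by hand.
REWRITES = [
    # The PyTorch backend becomes the default, top-level LLM.
    ("from tensorrt_llm._torch import LLM",
     "from tensorrt_llm import LLM"),
    # TensorRT-engine users now import the engine-backed LLM explicitly.
    ("from tensorrt_llm.llmapi import LLM",
     "from tensorrt_llm._tensorrt_engine import LLM"),
]


def rewrite_imports(source: str) -> str:
    """Apply each old -> new substitution, in order, to a source string."""
    for old, new in REWRITES:
        source = source.replace(old, new)
    return source
```

For example, `rewrite_imports("from tensorrt_llm._torch import LLM")` yields the new default import, `from tensorrt_llm import LLM`.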


docs/source/torch.md

Lines changed: 3 additions & 3 deletions

@@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You
 
 ## Quick Start
 
-Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model.
+Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model.
 
 ```{literalinclude} ../../examples/pytorch/quickstart.py
 :language: python
@@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
 which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
 llm.generate("Hello, my name is")
 ```
@@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
 In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
           enable_trtllm_sampler=True)
 sampling_params = SamplingParams(

docs/source/torch/adding_new_model.md

Lines changed: 1 addition & 1 deletion

@@ -186,7 +186,7 @@ __all__ = [
 Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 import modeling_mymodel
 
 def main():

docs/source/torch/arch_overview.md

Lines changed: 2 additions & 2 deletions

@@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d
 
 ## Top Level API
 
-The interface for PyTorch backend is `tensorrt._torch.LLM`.
+The interface for PyTorch backend is `tensorrt_llm.LLM`.
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 llm = LLM(model=<path_to_llama_from_hf>)
 ```

examples/apps/chat.py

Lines changed: 2 additions & 1 deletion

@@ -5,7 +5,8 @@
 import colorama
 from transformers import AutoTokenizer, PreTrainedTokenizer
 
-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
 
 
 class LlmConsole(code.InteractiveConsole):

examples/apps/fastapi_server.py

Lines changed: 2 additions & 1 deletion

@@ -18,8 +18,9 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
+from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.executor import CppExecutorError, RequestError
-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds.

examples/auto_deploy/build_and_run_ad.py

Lines changed: 2 additions & 1 deletion

@@ -7,11 +7,12 @@
 import torch
 from simple_config import SimpleConfig
 
+from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
 from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
 from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
 from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-from tensorrt_llm.llmapi.llm import LLM, RequestOutput
+from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.llmapi.llm_args import TorchCompileConfig
 from tensorrt_llm.sampling_params import SamplingParams

examples/llm-api/llm_auto_parallel.py

Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 ### Automatic Parallelism with LLM
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
 
 
 def main():

examples/llm-api/llm_eagle2_decoding.py

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 ### Generate Text Using Eagle2 Decoding
 
-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
                                  SamplingParams)
 
 

examples/llm-api/llm_eagle_decoding.py

Lines changed: 3 additions & 3 deletions

@@ -1,8 +1,8 @@
 ### Generate Text Using Eagle Decoding
 
-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
-                                 SamplingParams)
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
 
 
 def main():

examples/llm-api/llm_guided_decoding.py

Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 ### Generate text with guided decoding
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.llmapi import GuidedDecodingParams
 
 
