
Commit 268933b

Refactor imports inside tensorrt_llm._torch. (NVIDIA#3015)
Signed-off-by: Yuxian Qiu <[email protected]>
1 parent e68749c · commit 268933b

File tree: 21 files changed, +171 −180 lines

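Every hunk in this commit applies the same pattern: modules inside the tensorrt_llm._torch package stop importing sibling subpackages through the absolute tensorrt_llm._torch.* path and use package-relative imports instead. A minimal before/after sketch of the idea, using the allgather import that also appears in the first hunk below (the "before" line here is illustrative, not a line deleted in this commit):

# Before: a module under tensorrt_llm/_torch/attention_backend/ reaches a
# sibling subpackage through the full absolute path.
from tensorrt_llm._torch.distributed import allgather

# After: the same import written relative to the current package;
# ".." climbs from attention_backend/ up to the _torch package root.
from ..distributed import allgather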

tensorrt_llm/_torch/attention_backend/star_flashinfer.py

Lines changed: 9 additions & 1 deletion
@@ -1,9 +1,17 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import flashinfer
 import numpy as np
 import torch

+from tensorrt_llm.functional import AttentionMaskType
+from tensorrt_llm.models.modeling_utils import QuantConfig
+
 from ..distributed import allgather
 from ..modules.linear import ParallelConfig
-from .flashinfer import *
+from .flashinfer import FlashInferAttentionMetadata, PlanParams
+from .interface import AttentionBackend, AttentionMask, PredefinedAttentionMask
 from .vanilla import VanillaAttention

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 5 additions & 4 deletions
@@ -5,15 +5,16 @@

 import torch

-from tensorrt_llm._torch.attention_backend.interface import (
-    AttentionBackend, AttentionMask, AttentionMetadata, MLAParams,
-    PositionalEmbeddingParams, PredefinedAttentionMask, RopeParams)
-from tensorrt_llm._torch.attention_backend.vanilla import VanillaAttention
 from tensorrt_llm.functional import (AttentionMaskType, RopeEmbeddingUtils,
                                      RotaryScalingType)
 from tensorrt_llm.logger import logger
 from tensorrt_llm.models.modeling_utils import QuantConfig

+from .interface import (AttentionBackend, AttentionMask, AttentionMetadata,
+                        MLAParams, PositionalEmbeddingParams,
+                        PredefinedAttentionMask, RopeParams)
+from .vanilla import VanillaAttention
+

 # The type of requests in qkv passed to attention
 # Please keep sync with AttentionInputType in cpp/tensorrt_llm/thop/attentionOp.cpp

tensorrt_llm/_torch/auto_deploy/custom_ops/dist.py

Lines changed: 2 additions & 2 deletions
@@ -2,8 +2,8 @@

 import torch

-import tensorrt_llm._torch.auto_deploy.distributed.common as dist
-import tensorrt_llm._torch.auto_deploy.distributed.trtllm as trtllm_dist
+from ..distributed import common as dist
+from ..distributed import trtllm as trtllm_dist


 @torch.library.custom_op("dist::all_gather", mutates_args=(), device_types="cuda")

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 import torch
 import torch.nn.functional as F

-from tensorrt_llm._torch.modules.fused_moe import FusedMoE  # noqa: F401
+from ...modules.fused_moe import FusedMoE  # noqa: F401


 @torch.library.custom_op("moe::torch_moe", mutates_args=())

tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py

Lines changed: 2 additions & 6 deletions
@@ -4,12 +4,8 @@

 # use trtllm distributed ops to improve TP performance if possible
 try:
-    from tensorrt_llm._torch.distributed import AllReduce, allgather
-    from tensorrt_llm._torch.modules.linear import (
-        AllReduceFusionOp,
-        AllReduceParams,
-        ParallelConfig,
-    )
+    from ...distributed import AllReduce, allgather
+    from ...modules.linear import AllReduceFusionOp, AllReduceParams, ParallelConfig

     def trtllm_allgather(tensor, dim):
         rank, world_size = get_rank_world_size()

tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 )

 try:
-    from tensorrt_llm._torch.quantization.utils import float4_sf_dtype
+    from ...quantization.utils import float4_sf_dtype
 except ImportError:
     float4_sf_dtype = None

tensorrt_llm/_torch/compilation/patterns/ar_residual_norm.py

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,8 @@
                                      register_replacement)

 import tensorrt_llm
-from tensorrt_llm._torch.distributed import AllReduceFusionOp
+
+from ...distributed import AllReduceFusionOp

 aten = torch.ops.aten
 from tensorrt_llm.mapping import Mapping

tensorrt_llm/_torch/compilation/patterns/ub_allreduce.py

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,8 @@
                                      register_replacement)

 import tensorrt_llm
-from tensorrt_llm._torch.distributed import AllReduceFusionOp, AllReduceStrategy
+
+from ...distributed import AllReduceFusionOp, AllReduceStrategy

 aten = torch.ops.aten
 from tensorrt_llm.mapping import Mapping

tensorrt_llm/_torch/models/modeling_bert.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,6 @@
 from torch import nn
 from transformers import BertConfig

-from tensorrt_llm._torch.modules.linear import Linear
 from tensorrt_llm.llmapi.utils import print_colored_debug
 from tensorrt_llm.logger import logger

@@ -14,6 +13,7 @@
 from ..modules.attention import Attention
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
+from ..modules.linear import Linear
 from .modeling_utils import register_auto_model

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 5 additions & 6 deletions
@@ -9,15 +9,14 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig

-from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp,
-                                             AllReduceParams, DeepseekAllReduce,
-                                             ParallelConfig, allgather,
-                                             reducescatter)
 from tensorrt_llm.functional import PositionEmbeddingType
+from tensorrt_llm.llmapi.utils import enable_llm_debug

-from ...llmapi.utils import enable_llm_debug
 from ..attention_backend import AttentionMetadata
 from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams
+from ..distributed import (AllReduce, AllReduceFusionOp, AllReduceParams,
+                           DeepseekAllReduce, ParallelConfig, allgather,
+                           reducescatter)
 from ..model_config import ModelConfig
 from ..models.modeling_utils import MissingLayer, ModelConfig, support_pp
 from ..modules.attention import MLA
@@ -263,7 +262,7 @@ def __init__(self,
                  dtype: Optional[torch.dtype] = None,
                  tune_max_num_tokens: int = 8192,
                  model_config: ModelConfig = ModelConfig()):
-        from tensorrt_llm._torch.distributed import AllReduce
+        from ..distributed import AllReduce

         super().__init__()
         config = model_config.pretrained_config
