Skip to content

Commit 5f34cfd

Browse files
committed
feat: Add support for Groot N1.5 model
Signed-off-by: Dheeraj Peri <[email protected]>
1 parent a93266a commit 5f34cfd

File tree

4 files changed

+22
-117
lines changed

4 files changed

+22
-117
lines changed

py/torch_tensorrt/dynamo/conversion/impl/matmul.py

Lines changed: 20 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -48,17 +48,25 @@ def matrix_multiply(
4848
input, other = broadcast(
4949
ctx, input, other, f"{name}_input", f"{name}_other", preset_diff
5050
)
51-
if ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED):
52-
promoted_type = _enums.dtype._from(
53-
torch.promote_types(
54-
_enums.dtype._from(input.dtype).to(torch.dtype),
55-
_enums.dtype._from(other.dtype).to(torch.dtype),
56-
)
51+
if (
52+
ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
53+
and ctx.compilation_settings.use_fp32_acc
54+
):
55+
input = cast_trt_tensor(ctx, input, torch.float32, f"{name}_input_casted")
56+
other = cast_trt_tensor(ctx, other, torch.float32, f"{name}_other_casted")
57+
58+
matmul_layer = ctx.net.add_matrix_multiply(
59+
input, input_matrix_op, other, other_matrix_op
60+
)
61+
matmul_output = matmul_layer.get_output(0)
62+
63+
if (
64+
ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
65+
and ctx.compilation_settings.use_fp32_acc
66+
):
67+
matmul_output = cast_trt_tensor(
68+
ctx, matmul_output, torch.float16, f"{name}_output_casted"
5769
)
58-
trt_promoted_type = promoted_type.to(trt.DataType)
59-
input = cast_trt_tensor(ctx, input, trt_promoted_type, f"{name}_input_casted")
60-
other = cast_trt_tensor(ctx, other, trt_promoted_type, f"{name}_other_casted")
6170

62-
layer = ctx.net.add_matrix_multiply(input, input_matrix_op, other, other_matrix_op)
63-
set_layer_name(layer, target, name, source_ir)
64-
return layer.get_output(0)
71+
set_layer_name(matmul_layer, target, name, source_ir)
72+
return matmul_output

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
from torch_tensorrt.dynamo._settings import CompilationSettings
66
from torch_tensorrt.dynamo.utils import is_tegra_platform
77

8-
from .accumulate_fp32_matmul import accumulate_fp32_matmul
98
from .complex_graph_rewrite import complex_graph_detection
109
from .constant_folding import constant_fold
1110
from .fuse_distributed_ops import fuse_distributed_ops
@@ -25,7 +24,6 @@
2524
fuse_prims_broadcast,
2625
replace_max_pool_with_indices,
2726
remove_assert_nodes,
28-
accumulate_fp32_matmul,
2927
remove_num_users_is_0_nodes,
3028
complex_graph_detection,
3129
]

py/torch_tensorrt/dynamo/lowering/passes/accumulate_fp32_matmul.py

Lines changed: 0 additions & 102 deletions
This file was deleted.

py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -368,7 +368,8 @@ def compile(self) -> None:
368368
enabled_precisions=self.enabled_precisions,
369369
**self.additional_settings,
370370
)
371-
deallocate_module(self.original_model, delete_module=False)
371+
if self.additional_settings.get("offload_module_to_cpu", False):
372+
deallocate_module(self.original_model, delete_module=False)
372373
if self.enable_weight_streaming:
373374
self.set_weight_streaming_ctx(self.weight_streaming_budget)
374375

0 commit comments

Comments (0)