nv-auto-deploy
diff --git a/‎tensorrt_llm/_torch/auto_deploy/transformations/_graph.py‎
Lines changed: 4 additions & 5 deletions b/‎tensorrt_llm/_torch/auto_deploy/transformations/_graph.py‎
Lines changed: 4 additions & 5 deletions
@@ -96,24 +96,23 @@ def named_graphmodules(gm: fx.GraphModule) -> Iterator[Tuple[str, fx.GraphModule
             yield name, m
 
 
-def _move_single_gm_to_device(gm: GraphModule, device: torch.device) -> None:
+def _move_single_gm_to_device(
+    gm: GraphModule, device: torch.device, recompile_graph: bool = False
+) -> None:
     """Move one GraphModule and its nodes to the specified device in-place.
     Partially inspired by https://github.com/pytorch/pytorch/blob/05cb98f91d49df9eadfcb3fc29bbd1b621d88860/torch/export/passes/__init__.py#L11
     """
     # move state dict
     gm.to(device)
-    recompile_graph = False
 
     for node in gm.graph.nodes:
         # move all the nodes kwargs with burnt-in device
         if "device" in node.kwargs:
-            recompile_graph = True
             kwargs = node.kwargs.copy()
             kwargs["device"] = device
             node.kwargs = kwargs
 
         if is_op(node, torch.ops.aten.to.device):
-            recompile_graph = True
             args = list(node.args)
             args[1] = device
             node.args = tuple(args)
@@ -136,7 +135,7 @@ def move_to_device(gm: fx.GraphModule, device: DeviceLikeType) -> fx.GraphModule
 
     for _, subgm in reversed(list(named_graphmodules(gm))):
         # recompile graph to update self generated codes in subgraph
-        _move_single_gm_to_device(subgm, device)
+        _move_single_gm_to_device(subgm, device, subgm is not gm)
 
 
 def _is_impure_node(node: Node) -> bool: