
Commit 789240b

angelayi authored and pytorchmergebot committed
[invoke_subgraph] Don't run the graph twice when autograd enabled (pytorch#167245)
In the [previous PR](https://github.com/pytorch/pytorch/pull/167231/files#diff-e2b74af5d8b538a7d07d18507d27010703742ddad5f819992b55f5abc6d9a502R964-R966) we found that the autograd eager impl of invoke_subgraph calls the subgraph twice. If the subgraph contains effects, then those effects will be run twice, which is bad. This PR fixes the issue by getting the output metadata from `subgraph`'s `node.meta` if it exists.

Differential Revision: [D87392740](https://our.internmc.facebook.com/intern/diff/D87392740)

Pull Request resolved: pytorch#167245
Approved by: https://github.com/anijain2305
ghstack dependencies: pytorch#167231
1 parent f49833d commit 789240b
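
To make the problem concrete, here is a toy sketch of the behavior described above (hypothetical code, not PyTorch internals): when output metadata is gathered by executing the subgraph, any effect inside it fires once for the metadata pass and again for the actual forward.

```python
import torch

effect_log = []  # stands in for a real effect, e.g. the record_memory call in the test


def subgraph(x):
    effect_log.append("ran")  # side-effectful op inside the subgraph
    return x * 2, x.sum()


def forward_with_executed_metadata(x):
    # pass 1: run the subgraph just to inspect its outputs (the old behavior)
    outs = subgraph(x)
    num_fw_outs = len(outs)
    indexes_with_no_grad = {i for i, o in enumerate(outs) if not o.requires_grad}
    # pass 2: run the subgraph again for the actual forward
    return subgraph(x), (num_fw_outs, indexes_with_no_grad)


forward_with_executed_metadata(torch.randn(3, requires_grad=True))
print(effect_log)  # ['ran', 'ran'] -- the effect ran twice; this PR removes the first run
```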

File tree

2 files changed: 59 additions & 11 deletions


test/higher_order_ops/test_with_effects.py

Lines changed: 1 addition & 5 deletions
@@ -960,11 +960,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1):
         )

         recorded_list.clear()
-        # TODO: seems like invoke_subgraph's py_autograd impl calls the subgraph
-        # eagerly twice. Once for get_output_metadata and then once for
-        # InvokeSubgraphAutogradOp. This causes record_memory to be called twice.
-        with torch.no_grad():
-            out2 = ep.module()(x)
+        out2 = ep.module()(x)
         self.assertEqual(len(recorded_list), 4)
         self.assertTrue(torch.allclose(model(x)[0], out2[0]))

torch/_higher_order_ops/invoke_subgraph.py

Lines changed: 58 additions & 6 deletions
@@ -305,6 +305,62 @@ def create_fw_bw_graph(subgraph, operands, grad_outputs=None):


 def get_output_metadata(subgraph, *operands):
+    """
+    Extract metadata about the subgraph outputs WITHOUT executing the subgraph.
+    This avoids running side-effectful operations twice (once here, once in forward).
+    We analyze the graph structure statically to extract metadata.
+    """
+    # Unwrap FunctionalizeCtxWrapper if present
+    if isinstance(subgraph, FunctionalizeCtxWrapper):
+        subgraph = subgraph.subgraph
+
+    # If not a GraphModule, fall back to execution-based metadata extraction
+    if not isinstance(subgraph, torch.fx.GraphModule):
+        return _get_output_metadata_by_execution(subgraph, *operands)
+
+    output_metadata = OutputMetadata()
+
+    # Extract output arguments from the output node
+    # The output node has args=(output_values,) where output_values is a tuple/list
+    output_node = next(reversed(subgraph.graph.find_nodes(op="output")))
+    output_metadata.num_fw_outs = len(output_node.args[0])
+
+    for idx, output_arg in enumerate(output_node.args[0]):
+        if not isinstance(output_arg, torch.fx.Node):
+            if isinstance(output_arg, int):
+                output_metadata.indexes_with_symint.add(idx)
+            output_metadata.indexes_with_no_grad.add(idx)
+            continue
+
+        # Check node metadata for type information
+        if output_arg.meta.get("val") is None:
+            # If we don't have complete metadata for all outputs, fall back to execution
+            # This is important for correctness (e.g., detecting SymInts) even though it
+            # runs side-effectful operations
+            return _get_output_metadata_by_execution(subgraph, *operands)
+
+        val = output_arg.meta["val"]
+        if isinstance(val, torch.SymInt):
+            output_metadata.indexes_with_symint.add(idx)
+            output_metadata.indexes_with_no_grad.add(idx)
+        elif isinstance(val, torch.Tensor):
+            # Check if tensor requires grad from metadata
+            if hasattr(val, "requires_grad") and not val.requires_grad:
+                output_metadata.indexes_with_no_grad.add(idx)
+        else:
+            # Non-tensor, non-symint (shouldn't happen but be safe)
+            output_metadata.indexes_with_no_grad.add(idx)
+
+    return output_metadata
+
+
+def _get_output_metadata_by_execution(subgraph, *operands):
+    """
+    Fallback: Extract metadata by executing the subgraph.
+    This should only be used when static analysis fails.
+    WARNING: This will run side-effectful operations!
+    """
+
     with suspend_functionalization(), disable_functional_mode():
         with disable_proxy_modes_tracing():
             # args are functional tensors, generate some example tensors
@@ -324,19 +380,15 @@ def get_output_metadata(subgraph, *operands):

             num_fw_outs = len(fw_outs)

-            # Collect the indexes of none in the output to check that the grad
-            # is None at the corresponding index in the backward. This check is
-            # performed in the autograd.Function - InvokeSubgraphAutogradOp.
-            # Also collect the indexes of no_grad in the output to filter out
-            # the grad_outs in the `backward` method.
             output_metadata = OutputMetadata()
-
             output_metadata.num_fw_outs = num_fw_outs
+
             for idx, fw_out in enumerate(fw_outs):
                 if isinstance(fw_out, torch.SymInt):
                     output_metadata.indexes_with_symint.add(idx)
                 elif not fw_out.requires_grad:
                     output_metadata.indexes_with_no_grad.add(idx)
+
             return output_metadata
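For readers unfamiliar with the metadata path, here is a minimal, self-contained sketch of the static-inspection idea used by the new `get_output_metadata` (illustrative code, not the PR's; it assumes a recent PyTorch where `make_fx` populates `node.meta["val"]` and `torch.fx.Graph.find_nodes` is available): the output metadata is read off the FX graph's output node instead of calling the module again.

```python
import torch
from torch.fx.experimental.proxy_tensor import make_fx


def fn(x):
    return x * 2, x.shape[0]  # one tensor output, one (symbolic) int output


# Symbolic tracing records example values on each node's meta["val"].
gm = make_fx(fn, tracing_mode="symbolic")(torch.randn(3))

# Read output metadata from the graph itself -- no second execution of fn/gm.
output_node = next(reversed(gm.graph.find_nodes(op="output")))
for idx, arg in enumerate(output_node.args[0]):
    val = arg.meta.get("val") if isinstance(arg, torch.fx.Node) else arg
    if isinstance(val, torch.SymInt):
        print(idx, "SymInt output")        # -> indexes_with_symint in the PR
    elif isinstance(val, torch.Tensor):
        print(idx, "Tensor output", tuple(val.shape))
    else:
        print(idx, "other output:", type(val).__name__)
```

The PR additionally reads `requires_grad` from the same metadata when it is present, and falls back to the execution-based path (`_get_output_metadata_by_execution`) when any output lacks a `val`.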
