
Commit 6096c0f

tugsbayasgalan authored and pytorchmergebot committed
Export should use aot_export_joint_with_descriptors (pytorch#165931)
This diff moves export's run_decompositions to use aot_export_joint_with_descriptors instead of aot_export_module. In doing so, I ran into two main bugs:

1) aot_export_joint_with_descriptors doesn't correctly pass in the record_nn_module_stack flag, which is needed to populate nn_module_stack by switching the internal tracer.
2) When creating a symint from a negative input, we need to pass positive=False. This didn't matter before because aot_autograd returned integer inputs directly instead of creating symints.

Pull Request resolved: pytorch#165931
Approved by: https://github.com/zhxchen17
1 parent f6951cb commit 6096c0f
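
For orientation, here is a minimal sketch of the API this diff routes export through. The module, inputs, and the ExitStack handling are illustrative assumptions; only the _record_nn_module_stack flag comes from this PR:

import contextlib

import torch
from torch._functorch.aot_autograd import aot_export_joint_with_descriptors


class M(torch.nn.Module):
    def forward(self, x):
        return x.cos()


# The ExitStack keeps the tracing contexts alive while the caller works
# with the captured joint graph. _record_nn_module_stack makes the
# internal tracer a ModuleStackTracer so nn_module_stack is populated.
with contextlib.ExitStack() as stack:
    joint = aot_export_joint_with_descriptors(
        stack,
        M(),
        (torch.ones(5),),
        _record_nn_module_stack=True,
    )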

7 files changed (+167 −56 lines)


test/export/test_export.py

Lines changed: 50 additions & 12 deletions
@@ -13910,16 +13910,28 @@ def forward(self, x):
         inps = (torch.ones(5),)

         ep = torch.export.export(M(), inps).run_decompositions({})
-        self.assertExpectedInline(
-            str(ep.graph_module.code.strip()),
-            """\
+        if IS_FBCODE:
+            self.assertExpectedInline(
+                str(ep.graph_module.code.strip()),
+                """\
 def forward(self, x):
     cos = torch.ops.aten.cos.default(x)
     auto_functionalized = torch.ops.higher_order.auto_functionalized(torch.ops.testlib.foo.default, x = x, z = cos); x = cos = None
     getitem_3 = auto_functionalized[3]; auto_functionalized = None
     cos_1 = torch.ops.aten.cos.default(getitem_3)
     return (getitem_3, getitem_3, cos_1)""",
-        )
+            )
+        else:
+            self.assertExpectedInline(
+                str(ep.graph_module.code.strip()),
+                """\
+def forward(self, x):
+    cos = torch.ops.aten.cos.default(x)
+    auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.testlib.foo.default, _x_base_index = 0, _z_base_index = 1, _all_bases = [x, cos]); x = cos = None
+    getitem_3 = auto_functionalized_v2[3]; auto_functionalized_v2 = None
+    cos_1 = torch.ops.aten.cos.default(getitem_3)
+    return (getitem_3, getitem_3, cos_1)""",
+            )

     def test_custom_op_auto_warn_pre_dispatch(self):
         class M(torch.nn.Module):
@@ -13932,17 +13944,30 @@ def forward(self, x):
         inps = (torch.ones(5),)

         ep = torch.export.export(M(), inps).run_decompositions()
-        self.assertExpectedInline(
-            str(ep.graph_module.code.strip()),
-            """\
+        if IS_FBCODE:
+            self.assertExpectedInline(
+                str(ep.graph_module.code.strip()),
+                """\
 def forward(self, x):
     cos = torch.ops.aten.cos.default(x)
     cos_1 = torch.ops.aten.cos.default(x); x = None
     auto_functionalized = torch.ops.higher_order.auto_functionalized(torch.ops.testlib.foo.default, x = cos, z = cos_1); cos = cos_1 = None
     getitem_3 = auto_functionalized[3]; auto_functionalized = None
     cos_2 = torch.ops.aten.cos.default(getitem_3); getitem_3 = None
     return (cos_2,)""",
-        )
+            )
+        else:
+            self.assertExpectedInline(
+                str(ep.graph_module.code.strip()),
+                """\
+def forward(self, x):
+    cos = torch.ops.aten.cos.default(x)
+    cos_1 = torch.ops.aten.cos.default(x); x = None
+    auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.testlib.foo.default, _x_base_index = 0, _z_base_index = 1, _all_bases = [cos, cos_1]); cos = cos_1 = None
+    getitem_3 = auto_functionalized_v2[3]; auto_functionalized_v2 = None
+    cos_2 = torch.ops.aten.cos.default(getitem_3); getitem_3 = None
+    return (cos_2,)""",
+            )

         ep = torch.export._trace._export(M(), inps, pre_dispatch=True)
         self.assertExpectedInline(
@@ -15338,17 +15363,30 @@ def forward(self, x):
             decomp_table,
         )

-        self.assertExpectedInline(
-            str(ep.graph_module.code).strip(),
-            """\
+        if IS_FBCODE:
+            self.assertExpectedInline(
+                str(ep.graph_module.code).strip(),
+                """\
 def forward(self, x):
     foo_functional = torch.ops.testlib.foo_functional.default(x); x = None
     cos = torch.ops.aten.cos.default(foo_functional)
     auto_functionalized = torch.ops.higher_order.auto_functionalized(torch.ops.testlib.foo.default, x = foo_functional, z = cos); foo_functional = cos = None
     getitem_3 = auto_functionalized[3]; auto_functionalized = None
     cos_1 = torch.ops.aten.cos.default(getitem_3)
     return (getitem_3, cos_1)""",
-        )
+            )
+        else:
+            self.assertExpectedInline(
+                str(ep.graph_module.code).strip(),
+                """\
+def forward(self, x):
+    foo_functional = torch.ops.testlib.foo_functional.default(x); x = None
+    cos = torch.ops.aten.cos.default(foo_functional)
+    auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.testlib.foo.default, _x_base_index = 0, _z_base_index = 1, _all_bases = [foo_functional, cos]); foo_functional = cos = None
+    getitem_3 = auto_functionalized_v2[3]; auto_functionalized_v2 = None
+    cos_1 = torch.ops.aten.cos.default(getitem_3)
+    return (getitem_3, cos_1)""",
+            )

     def test_run_decompositions_keep_metadata(self):
         """Make sure the metadata is kept after exported program run_decompositions."""

torch/_export/config.py

Lines changed: 6 additions & 0 deletions
@@ -10,6 +10,7 @@
 import sys
 from typing import Any, TYPE_CHECKING

+from torch._environment import is_fbcode
 from torch.utils._config_module import install_config_module


@@ -27,6 +28,11 @@
 # that we don't know how to proxy, resulting in untracked fake tensors
 error_on_lifted_constant_tensors = True

+# enable auto_functionalized_v2 in export
+# We turn this off in fbcode due to downstream users not
+# being ready to handle auto_functionalized_v2.
+enable_auto_functionalized_v2_for_export = not is_fbcode()
+
 if TYPE_CHECKING:
     from torch.utils._config_typing import *  # noqa: F401, F403
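
Because this module is installed via install_config_module, the new flag can be inspected and patched like other torch config modules. A minimal sketch (the patch usage is an assumption based on the standard config-module interface; only the flag name comes from this diff):

import torch._export.config as export_config

# OSS default is True; fbcode default is False (see is_fbcode() above).
print(export_config.enable_auto_functionalized_v2_for_export)

# Temporarily force the v1 auto_functionalized HOP during export,
# e.g. to reproduce the IS_FBCODE expected graphs from the tests above.
with export_config.patch(enable_auto_functionalized_v2_for_export=False):
    ep = ...  # torch.export.export(...).run_decompositions({})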

torch/_functorch/_aot_autograd/collect_metadata_analysis.py

Lines changed: 1 addition & 4 deletions
@@ -166,9 +166,6 @@ def run_functionalized_fw_and_collect_metadata(
     # Note: this is guaranteed to be set when running under dynamo
     static_input_indices: Optional[list[int]] = None,
     pre_dispatch: bool = False,
-    # is_export is technically only needed to avoid using functionalization V2
-    # during analysis
-    is_export: bool = False,
 ) -> Callable[..., ViewAndMutationMeta]:
     memo: dict[Tensor, Tensor] = {}

@@ -200,7 +197,7 @@ def inner(*flat_args):

         # It doesn't matter if we run this under predispatch or not because it is
         # only for figuring out metadata
-        mode = FunctionalTensorMode(_allow_token_discovery=True, export=is_export)
+        mode = FunctionalTensorMode(_allow_token_discovery=True)
         suppress_pending = contextlib.nullcontext()
         fake_mode = detect_fake_mode()
         if fake_mode and (shape_env := fake_mode.shape_env):

torch/_functorch/_aot_autograd/frontend_utils.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def convert(idx, x):
             return x
         source = ConstantSource(f"sym_{idx}")
         return shape_env.create_symintnode(
-            shape_env.create_symbol(x, source),
+            shape_env.create_symbol(x, source, positive=x >= 0),
             hint=x,
             source=source,
         )
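
A minimal sketch of the bug this fixes (the ShapeEnv/ConstantSource usage mirrors the code above; the -3 input is an illustrative assumption). Without positive=False, create_symbol records the new symbol as positive, which contradicts a negative integer input once export actually creates a symint for it:

from torch._dynamo.source import ConstantSource
from torch.fx.experimental.symbolic_shapes import ShapeEnv

shape_env = ShapeEnv()
source = ConstantSource("sym_0")
x = -3  # a negative int input; aot_autograd used to return these directly

# positive=x >= 0 tells the ShapeEnv this symbol may be negative.
sym = shape_env.create_symintnode(
    shape_env.create_symbol(x, source, positive=x >= 0),
    hint=x,
    source=source,
)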

torch/_functorch/aot_autograd.py

Lines changed: 10 additions & 2 deletions
@@ -573,7 +573,6 @@ def _dup_fake_script_obj(fake_flat_args):
         keep_input_mutations=aot_config.keep_inference_input_mutations,
         is_train=needs_autograd,
         pre_dispatch=aot_config.pre_dispatch,
-        is_export=aot_config.is_export,
     )(*_dup_fake_script_obj(fake_flat_args))

     req_subclass_dispatch = requires_subclass_dispatch(
@@ -905,6 +904,7 @@ def prepare_aot_module_simplified(
     *,
     force_non_lazy_backward_lowering: bool = False,
     disable_functionalization: bool = False,
+    _record_nn_module_stack: bool = False,
 ):
     if not flatten:
         assert kwargs is None
@@ -931,7 +931,13 @@
     # NB: This doesn't change the in/out convention, except adding the
     # parameters as explicit arguments
     functional_call = create_functional_call(
-        mod, params_buffers_spec, params_len + buffers_len, strict_out_tuple=not flatten
+        mod,
+        params_buffers_spec,
+        params_len + buffers_len,
+        strict_out_tuple=not flatten,
+        # We need this for export to run ModuleStackTracer
+        # instead of PythonKeyTracer
+        store_orig_mod=_record_nn_module_stack,
     )

     full_args = [*params_flat, *buffers_flat, *args]
@@ -1175,6 +1181,7 @@ def aot_export_joint_with_descriptors(
     keep_inference_input_mutations=False,
     ignore_shape_env=False,
     disable_functionalization=False,
+    _record_nn_module_stack=False,
 ) -> JointWithDescriptors:
     """
     This API captures the joint graph for an nn.Module. However, unlike
@@ -1265,6 +1272,7 @@ def aot_export_joint_with_descriptors(
         # context.
         force_non_lazy_backward_lowering=True,
         disable_functionalization=disable_functionalization,
+        _record_nn_module_stack=_record_nn_module_stack,
     )

     # TODO: Maybe this should be in create_aot_state? Not sure, that would
torch/_subclasses/functional_tensor.py

Lines changed: 9 additions & 3 deletions
@@ -145,7 +145,7 @@ def __new__(cls, elem, mode):
         out.elem = elem

         if (
-            not mode.export
+            torch._export.config.enable_auto_functionalized_v2_for_export
             and torch.is_inference_mode_enabled()
             and torch._inductor.config.enable_auto_functionalized_v2
         ):
@@ -449,12 +449,18 @@ def unwrap(x):
         ) and not torch._C._dispatch_has_kernel_for_dispatch_key(
             func.name(), torch._C.DispatchKey.Functionalize
         ):
+            import torch._export.config as export_config
             import torch._inductor.config as inductor_config

-            if self.export or not inductor_config.enable_auto_functionalized_v2:
+            if torch.compiler.is_exporting():
+                if export_config.enable_auto_functionalized_v2_for_export:
+                    return do_auto_functionalize_v2(self, func, args, kwargs)
+
                 return do_auto_functionalize(self, func, args, kwargs)
-            else:
+
+            if inductor_config.enable_auto_functionalized_v2:
                 return do_auto_functionalize_v2(self, func, args, kwargs)
+            return do_auto_functionalize(self, func, args, kwargs)

         from torch._higher_order_ops.effects import handle_effects, has_effects