@@ -1,5 +1,6 @@
 # Owner(s): ["module: c10d"]
 
+import itertools
 import os
 from unittest import skipIf
 
@@ -860,55 +861,69 @@ def test_multimem_one_shot_all_reduce( |
 
     @skipIfRocm
     @skip_if_lt_x_gpu(4)
-    @parametrize("dtype", [torch.float, torch.bfloat16])
-    @parametrize("align_bytes", [4, 8, 16])
-    @parametrize("size_bytes", [4, 8192, 8196])
-    def test_one_shot_all_reduce(
-        self, dtype: torch.dtype, size_bytes: int, align_bytes: int
-    ) -> None:
+    def test_one_shot_all_reduce(self) -> None:
         self._init_process()
         group_name = dist.group.WORLD.group_name
 
-        inp = symm_mem.empty(
-            size_bytes // dtype.itemsize, dtype=dtype, device=self.device
-        ).normal_()
-        symm_mem.rendezvous(inp, group=group_name)
-
-        res = torch.ops.symm_mem.one_shot_all_reduce(inp, "sum", group_name)
-        self._verify_all_reduce_result(inp, res)
+        for dtype, size_bytes, align_bytes, copy, offset in itertools.product(
+            [torch.float, torch.bfloat16],
+            [4, 8192, 8196],
+            [4, 8, 16],  # align_bytes (not referenced in this loop body)
+            [True, False],
+            [0, 16],
+        ):
+            inp = symm_mem.empty(
+                size_bytes // dtype.itemsize + offset, dtype=dtype, device=self.device
+            )
+            symm_mem.rendezvous(inp, group=group_name)
+            if copy:
+                local_inp = torch.randn_like(inp[offset:])
+                res = torch.ops.symm_mem.one_shot_all_reduce_copy(
+                    inp[offset:], local_inp, "sum", group_name
+                )
+            else:
+                inp.normal_()
+                res = torch.ops.symm_mem.one_shot_all_reduce(
+                    inp[offset:], "sum", group_name
+                )
+            self._verify_all_reduce_result(local_inp if copy else inp[offset:], res)
 
         dist.destroy_process_group()
 
     @skipIfRocm
     @skip_if_lt_x_gpu(4)
-    @parametrize("dtype", [torch.float, torch.bfloat16])
-    @parametrize("align_bytes", [4, 8, 16])
-    @parametrize("size_bytes", [4, 8192, 8196])
-    def test_two_shot_all_reduce(
-        self, dtype: torch.dtype, size_bytes: int, align_bytes: int
-    ) -> None:
+    def test_two_shot_all_reduce(self) -> None:
         self._init_process()
         group_name = dist.group.WORLD.group_name
 
-        t = symm_mem.empty(16384, dtype=dtype, device=self.device).fill_(0)
-        symm_mem.rendezvous(t, group=group_name)
-
-        self.assertTrue(t.data_ptr() % 16 == 0)
-        self.assertTrue(align_bytes % t.element_size() == 0)
-        self.assertTrue(size_bytes % t.element_size() == 0)
-
-        shift = align_bytes // t.element_size()
-        numel = size_bytes // t.element_size()
-        res = t[shift : shift + numel]
-        res.normal_()
-        inp = res.clone()
-
-        torch.ops.symm_mem.two_shot_all_reduce_(res, "sum", group_name)
+        for dtype, size_bytes, align_bytes, inplace in itertools.product(
+            [torch.float, torch.bfloat16],
+            [4, 8192, 8196],
+            [4, 8, 16],
+            [True, False],
+        ):
+            t = symm_mem.empty(16384, dtype=dtype, device=self.device).fill_(0)
+            symm_mem.rendezvous(t, group=group_name)
+
+            self.assertTrue(t.data_ptr() % 16 == 0)
+            self.assertTrue(align_bytes % t.element_size() == 0)
+            self.assertTrue(size_bytes % t.element_size() == 0)
+
+            shift = align_bytes // t.element_size()
+            numel = size_bytes // t.element_size()
+            res = t[shift : shift + numel]
+            res.normal_()
+            inp = res.clone()
+            if inplace:
+                torch.ops.symm_mem.two_shot_all_reduce_(res, "sum", group_name)
+            else:
+                out = torch.empty_like(inp)
+                torch.ops.symm_mem.two_shot_all_reduce_out(res, "sum", group_name, out)
 
-        # Head and tail should not be written
-        self.assertTrue(t[:shift].eq(0).all().item())
-        self.assertTrue(t[shift + numel :].eq(0).all().item())
-        self._verify_all_reduce_result(inp, res)
+            # Head and tail should not be written
+            self.assertTrue(t[:shift].eq(0).all().item())
+            self.assertTrue(t[shift + numel :].eq(0).all().item())
+            self._verify_all_reduce_result(inp, res if inplace else out)
 
         dist.destroy_process_group()
 
|
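A note on the two one-shot variants exercised above: `one_shot_all_reduce` reduces a tensor that already lives in symmetric memory, while the new `one_shot_all_reduce_copy` reduces an ordinary local tensor and uses the symmetric-memory tensor only as a staging buffer, which is why the test calls `normal_()` on `inp` only in the non-copy path. A minimal standalone sketch, assuming a multi-GPU `torchrun` launch, an NCCL backend, and the test file's `import torch.distributed._symmetric_memory as symm_mem` alias; the setup below is illustrative, not the test's actual `_init_process()` helper:

```python
import os

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem

rank = int(os.environ["RANK"])  # set by torchrun; assumes a single-node launch
torch.cuda.set_device(rank)
dist.init_process_group("nccl")
group_name = dist.group.WORLD.group_name

# Allocate a symmetric-memory buffer and rendezvous with peer ranks.
inp = symm_mem.empty(8192, dtype=torch.bfloat16, device="cuda")
symm_mem.rendezvous(inp, group=group_name)

# Variant 1: the input itself lives in symmetric memory.
inp.normal_()
res = torch.ops.symm_mem.one_shot_all_reduce(inp, "sum", group_name)

# Variant 2: reduce an ordinary tensor; `inp` is only a staging buffer.
local = torch.randn(8192, dtype=torch.bfloat16, device="cuda")
res = torch.ops.symm_mem.one_shot_all_reduce_copy(inp, local, "sum", group_name)

dist.destroy_process_group()
```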
|
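Likewise for the two-shot pair: `two_shot_all_reduce_` reduces in place, overwriting the symmetric-memory slice, while the new `two_shot_all_reduce_out` writes the full result into a caller-provided output tensor. A sketch under the same assumptions as the previous example; note that in the `_out` path the symmetric buffer may still be mutated as communication scratch, and the test verifies only `out`:

```python
# Continues the setup from the previous sketch (process group, group_name).
t = symm_mem.empty(8192, dtype=torch.float, device="cuda")
symm_mem.rendezvous(t, group=group_name)
t.normal_()

# In place: t is overwritten with the reduced values.
torch.ops.symm_mem.two_shot_all_reduce_(t, "sum", group_name)

# Out of place: the full result lands in `out`.
out = torch.empty_like(t)
torch.ops.symm_mem.two_shot_all_reduce_out(t, "sum", group_name, out)
```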
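`_verify_all_reduce_result` is defined elsewhere in this file; conceptually it checks the symmetric-memory result against a reference computed with a regular collective. A hypothetical reconstruction for readers of this hunk (names and tolerances are assumptions, not the helper's actual body):

```python
def _verify_all_reduce_result(self, inp: torch.Tensor, res: torch.Tensor) -> None:
    # Hypothetical sketch: gather every rank's input and compare `res`
    # against the explicit sum of the gathered inputs. A loose tolerance
    # accommodates bf16 and reduction-order differences.
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(inp) for _ in range(world_size)]
    dist.all_gather(gathered, inp.contiguous())
    expected = torch.stack(gathered).sum(dim=0)
    torch.testing.assert_close(res, expected, rtol=1e-2, atol=1e-2)
```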