
Commit 1d51e69

pytorchbot and lw authored
[async-TP] Turn asserts back into silent skips (pytorch#158736)
[async-TP] Turn asserts back into silent skips (pytorch#158572)

pytorch#149946 modified some checks that verify whether async-TP is "applicable" to a given collective operation in a graph. Before, the pattern-matching + replacement would simply be skipped; now these checks are asserts that fail and raise. This is causing concrete issues in some graphs that use 2-dimensional device meshes (e.g., TP + CP) but have symm-mem enabled on only one dimension. See pytorch#158569.

This PR turns these asserts back into harmless early exits. Note that this only needed to be done for reduce-scatters, as it was already the case for all-gathers.

Pull Request resolved: pytorch#158572
Approved by: https://github.com/danielvegamyhre, https://github.com/atalman

(cherry picked from commit fac0be7)
Co-authored-by: Luca Wehrstedt <[email protected]>
1 parent 06152d9 commit 1d51e69
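For context, a minimal sketch of the situation the commit message describes. The mesh shape, dimension names, and setup below are assumptions for illustration (not taken from pytorch#158569): a 2-dimensional device mesh where symmetric memory is enabled for only one dimension, so the async-TP pass should now skip fusion on the other dimension instead of raising.

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed._symmetric_memory import enable_symm_mem_for_group

# Assumes a process group has already been initialized (e.g., world size 4).
# Hypothetical 2D mesh: tensor parallel x context parallel.
mesh = DeviceMesh(
    "cuda",
    torch.arange(dist.get_world_size()).view(2, -1),
    mesh_dim_names=("tp", "cp"),
)

# Symmetric memory is enabled only for the "tp" dimension; the "cp"
# dimension's process group has no symm-mem workspace.
enable_symm_mem_for_group(mesh["tp"].get_group().group_name)

# Before this PR, compiling a graph containing a reduce-scatter over the
# "cp" group with torch._inductor.config._micro_pipeline_tp = True tripped
# the assert in fuse_matmul_reduce_scatter; after this PR the fusion is
# simply skipped for that collective.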

3 files changed, +66 -8 lines

test/distributed/tensor/parallel/test_micro_pipeline_tp.py

Lines changed: 50 additions & 0 deletions
@@ -494,5 +494,55 @@ def test_dtensor_seq_par(self, shard_dim: int):
         self.assertNotIn("reduce_scatter_tensor", code)


+@instantiate_parametrized_tests
+class MicroPipelineTP4GPUTest(TestCase):
+    def setUp(self):
+        torch._inductor.config._micro_pipeline_tp = True
+
+        self.rank = 0
+        self.world_size = 4
+        torch.cuda.set_device("cuda:0")
+
+        store = FakeStore()
+        dist.init_process_group(
+            backend="fake",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+
+    def tearDown(self):
+        dist.destroy_process_group()
+
+    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @fresh_cache()
+    def test_extra_collectives(self):
+        device_mesh = DeviceMesh(
+            "cuda",
+            torch.arange(0, self.world_size).view(2, -1),
+            mesh_dim_names=("tp", "other"),
+        )
+
+        def func(inp: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> torch.Tensor:
+            hidden = all_gather_tensor(inp, 0, (device_mesh, 0)) @ w1.t()
+            full_hidden = all_gather_tensor(hidden, 0, (device_mesh, 1))
+            full_hidden /= full_hidden.pow(2).sum().sqrt()
+            hidden = reduce_scatter_tensor(full_hidden, "avg", 0, (device_mesh, 1))
+            return reduce_scatter_tensor(hidden @ w2.t(), "avg", 0, (device_mesh, 0))
+
+        inp = torch.rand(8, 10, device="cuda")
+        w1 = torch.rand(7, 10, device="cuda")
+        w2 = torch.rand(10, 7, device="cuda")
+
+        with _test_mode(group_names={device_mesh["tp"].get_group().group_name}):
+            compiled = torch.compile(func)
+            code = run_and_get_triton_code(compiled, inp, w1, w2)
+
+        self.assertIn("fused_all_gather_matmul", code)
+        self.assertIn("all_gather_into_tensor", code)
+        self.assertIn("fused_matmul_reduce_scatter", code)
+        self.assertIn("reduce_scatter_tensor", code)
+
+
 if __name__ == "__main__":
     run_tests()

torch/_inductor/fx_passes/micro_pipeline_tp.py

Lines changed: 7 additions & 6 deletions
@@ -850,9 +850,11 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:

     Returns boolean indicating if fusion was successful or not.
     """
-    assert torch.distributed.is_available() and torch.distributed.is_nccl_available(), (
-        "torch.distributed and NCCL must be available to use async tensor parallelism"
-    )
+    if (
+        not torch.distributed.is_available()
+        or not torch.distributed.is_nccl_available()
+    ):
+        return

     from torch.distributed._symmetric_memory import (
         is_symm_mem_enabled_for_group,
@@ -875,9 +877,8 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
         reduce_scatter.group_name,
     )

-    assert is_symm_mem_enabled_for_group(group_name), (
-        f"symmetric memory is not enabled for process group {group_name}, this is required for async TP"
-    )
+    if not is_symm_mem_enabled_for_group(group_name):
+        return

     # Currently fused_matmul_reduce_scatter doesn't return the matmul result,
     # so we can't apply the fusion if the matmul result is used by multiple

torch/distributed/_symmetric_memory/__init__.py

Lines changed: 9 additions & 2 deletions
@@ -47,23 +47,28 @@ def enable_symm_mem_for_group(group_name: str) -> None:


 _is_test_mode: bool = False
+_mocked_group_names: Optional[set[str]] = None


 @contextmanager
-def _test_mode() -> Generator[None, None, None]:
+def _test_mode(group_names: Optional[set[str]] = None) -> Generator[None, None, None]:
     """
     Forces ``is_symm_mem_enabled_for_group()`` to return ``True`` and the ops
     defined in the ``symm_mem`` namespace to use fallback implementations.

     The context manager is not thread safe.
     """
     global _is_test_mode
+    global _mocked_group_names
     prev = _is_test_mode
+    prev_group_names = _mocked_group_names
     try:
         _is_test_mode = True
+        _mocked_group_names = group_names
         yield
     finally:
         _is_test_mode = prev
+        _mocked_group_names = prev_group_names


 def is_symm_mem_enabled_for_group(group_name: str) -> bool:
@@ -73,7 +78,9 @@ def is_symm_mem_enabled_for_group(group_name: str) -> bool:
     Args:
         group_name (str): the name of the process group.
     """
-    return _is_test_mode or group_name in _group_name_to_store
+    if _is_test_mode:
+        return _mocked_group_names is None or group_name in _mocked_group_names
+    return group_name in _group_name_to_store


 _group_name_to_workspace_tensor: dict[str, Optional[torch.Tensor]] = {}
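As a quick illustration of the new group_names argument (the group names below are made up, not real process groups), _test_mode now mocks symm-mem enablement only for the listed groups, while passing nothing keeps the previous behavior of treating every group as enabled:

from torch.distributed._symmetric_memory import (
    _test_mode,
    is_symm_mem_enabled_for_group,
)

# Only the groups listed in group_names report symm-mem as enabled.
with _test_mode(group_names={"tp_group"}):
    assert is_symm_mem_enabled_for_group("tp_group")
    assert not is_symm_mem_enabled_for_group("cp_group")

# With no argument, every group is treated as enabled (previous behavior).
with _test_mode():
    assert is_symm_mem_enabled_for_group("cp_group")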
