Commit c614128

tianyu-l authored and pytorchmergebot committed
[DTensor] support Replicate -> Partial("avg") + support distribute_tensor with Partial placements (pytorch#168133)
Pull Request resolved: pytorch#168133 Approved by: https://github.com/ezyang
1 parent 9bca3c1 commit c614128
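
As a rough usage sketch of what this change enables (not taken from the PR; the mesh setup and tensor values below are illustrative and assume an already-initialized 1-D device mesh of 4 ranks):

    import torch
    from torch.distributed.device_mesh import init_device_mesh
    from torch.distributed.tensor import distribute_tensor, Partial

    # Assumed setup: a 1-D mesh over 4 ranks ("cpu" and the world size are illustrative).
    mesh = init_device_mesh("cpu", (4,))

    x = torch.arange(8.0)

    # distribute_tensor now accepts Partial placements: the tensor is replicated on the
    # mesh dimension and then "partitioned" so that the reduce op reconstructs the value.
    dt_sum = distribute_tensor(x, mesh, [Partial(reduce_op="sum")])  # local = x / 4 on each rank
    dt_avg = distribute_tensor(x, mesh, [Partial(reduce_op="avg")])  # local = x on each rank

    # Reducing the partial values back should recover x in both cases.
    torch.testing.assert_close(dt_sum.full_tensor(), x)
    torch.testing.assert_close(dt_avg.full_tensor(), x)

    # Unsupported reduce ops such as "prod" raise ValueError, per the new test in test_api.py.

Since Partial now prints its reduce op, the DebugMode expectations in the diffs below change from a bare P to P(sum) or P(avg).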

7 files changed: +89 -35 lines changed


test/distributed/tensor/debug/test_debug_mode.py

Lines changed: 23 additions & 23 deletions
@@ -76,7 +76,7 @@ def test_debug_mode_mm(self):
   _c10d_functional::all_gather_into_tensor(t$2: f32[1, 32], 8, 0) -> t$3: f32[8, 32]
   _c10d_functional::wait_tensor(t$3: f32[8, 32]) -> t$3: f32[8, 32]
   aten::mm(t$4: f32[1, 8], t$3: f32[8, 32]) -> t$5: f32[1, 32]
-  <method 'sum' of 'torch._C.TensorBase' objects>(dt$6: f32[8, 32]| S(0)) -> dt$8: f32[]| P
+  <method 'sum' of 'torch._C.TensorBase' objects>(dt$6: f32[8, 32]| S(0)) -> dt$8: f32[]| P(sum)
   aten::sum(dt$6: f32[8, 32]| S(0))
   aten::sum(t$5: f32[1, 32]) -> t$7: f32[]""",
   )
@@ -179,8 +179,8 @@ def test_debug_mode_backward(self):
   <method 'sum' of 'torch._C.TensorBase' objects>(dt: f32[8, 8]| S(0))
   aten::sum(dt: f32[8, 8]| S(0))
   aten::sum(t: f32[1, 8])
-  torch._tensor.backward(dt: f32[]| P, gradient=None, retain_graph=None, create_graph=False, inputs=None)
-  aten::ones_like(dt: f32[]| P, pin_memory=False, memory_format=torch.preserve_format)
+  torch._tensor.backward(dt: f32[]| P(sum), gradient=None, retain_graph=None, create_graph=False, inputs=None)
+  aten::ones_like(dt: f32[]| P(sum), pin_memory=False, memory_format=torch.preserve_format)
   aten::ones_like(t: f32[], pin_memory=False, memory_format=torch.preserve_format)
   aten::expand(dt: f32[]| R, [8, 8])
   aten::expand(t: f32[], [8, 8])
@@ -189,9 +189,9 @@ def test_debug_mode_backward(self):
   aten::clone(t: f32[8, 1])
   aten::_to_copy(t: f32[8, 1], dtype=torch.float32, layout=torch.strided, device=cpu)
   redistribute_input(t: f32[8, 8], trace: R->S(0))
-  aten::detach(t: f32[8, 1])
   aten::split.Tensor(t: f32[8, 8], 1)
   aten::clone(t: f32[1, 8])
+  aten::detach(t: f32[8, 1])
   aten::_to_copy(t: f32[1, 8], dtype=torch.float32, layout=torch.strided, device=cpu)
   aten::detach(t: f32[1, 8])""",
   )
@@ -253,50 +253,50 @@ def test_debug_mode_einsum(self):
   self.assertExpectedInline(
   debug_mode.debug_string(),
   """\
-  torch.functional.einsum(bld,dnh->blnh, dt: f32[16, 6, 8]| PR, dt: f32[8, 4, 4]| RP)
-  aten::unsqueeze(dt: f32[16, 6, 8]| PR, 3)
+  torch.functional.einsum(bld,dnh->blnh, dt: f32[16, 6, 8]| P(sum)R, dt: f32[8, 4, 4]| RP(sum))
+  aten::unsqueeze(dt: f32[16, 6, 8]| P(sum)R, 3)
   aten::unsqueeze(t: f32[16, 6, 8], 3)
-  aten::unsqueeze(dt: f32[16, 6, 8, 1]| PR, 4)
+  aten::unsqueeze(dt: f32[16, 6, 8, 1]| P(sum)R, 4)
   aten::unsqueeze(t: f32[16, 6, 8, 1], 4)
-  aten::permute(dt: f32[16, 6, 8, 1, 1]| PR, [0, 1, 3, 4, 2])
+  aten::permute(dt: f32[16, 6, 8, 1, 1]| P(sum)R, [0, 1, 3, 4, 2])
   aten::permute(t: f32[16, 6, 8, 1, 1], [0, 1, 3, 4, 2])
-  aten::unsqueeze(dt: f32[8, 4, 4]| RP, 3)
+  aten::unsqueeze(dt: f32[8, 4, 4]| RP(sum), 3)
   aten::unsqueeze(t: f32[8, 4, 4], 3)
-  aten::unsqueeze(dt: f32[8, 4, 4, 1]| RP, 4)
+  aten::unsqueeze(dt: f32[8, 4, 4, 1]| RP(sum), 4)
   aten::unsqueeze(t: f32[8, 4, 4, 1], 4)
-  aten::permute(dt: f32[8, 4, 4, 1, 1]| RP, [3, 4, 1, 2, 0])
+  aten::permute(dt: f32[8, 4, 4, 1, 1]| RP(sum), [3, 4, 1, 2, 0])
   aten::permute(t: f32[8, 4, 4, 1, 1], [3, 4, 1, 2, 0])
-  aten::permute(dt: f32[16, 6, 1, 1, 8]| PR, [0, 1, 4, 2, 3])
+  aten::permute(dt: f32[16, 6, 1, 1, 8]| P(sum)R, [0, 1, 4, 2, 3])
   aten::permute(t: f32[16, 6, 1, 1, 8], [0, 1, 4, 2, 3])
-  aten::view(dt: f32[16, 6, 8, 1, 1]| PR, [1, 96, 8])
+  aten::view(dt: f32[16, 6, 8, 1, 1]| P(sum)R, [1, 96, 8])
   aten::view(t: f32[16, 6, 8, 1, 1], [1, 96, 8])
-  aten::permute(dt: f32[1, 1, 4, 4, 8]| RP, [4, 2, 3, 0, 1])
+  aten::permute(dt: f32[1, 1, 4, 4, 8]| RP(sum), [4, 2, 3, 0, 1])
   aten::permute(t: f32[1, 1, 4, 4, 8], [4, 2, 3, 0, 1])
-  aten::view(dt: f32[8, 4, 4, 1, 1]| RP, [1, 8, 16])
+  aten::view(dt: f32[8, 4, 4, 1, 1]| RP(sum), [1, 8, 16])
   aten::view(t: f32[8, 4, 4, 1, 1], [1, 8, 16])
-  aten::bmm(dt: f32[1, 96, 8]| PR, dt: f32[1, 8, 16]| RP)
-  redistribute_input(0, PR -> S(2)[0]S(2)[1])
-  redistribute_input(t: f32[1, 96, 8], trace: PR->S(2)R->S(2)[0]S(2)[1])
+  aten::bmm(dt: f32[1, 96, 8]| P(sum)R, dt: f32[1, 8, 16]| RP(sum))
+  redistribute_input(0, P(sum)R -> S(2)[0]S(2)[1])
+  redistribute_input(t: f32[1, 96, 8], trace: P(sum)R->S(2)R->S(2)[0]S(2)[1])
   aten::chunk(t: f32[1, 96, 8], 4, 2)
   aten::cat(['t: f32[1, 96, 2]', 't: f32[1, 96, 2]', 't: f32[1, 96, 2]', 't: f32[1, 96, 2]'])
   _c10d_functional::reduce_scatter_tensor(t: f32[4, 96, 2], sum, 4, 1)
   _c10d_functional::wait_tensor(t: f32[1, 96, 2])
   aten::chunk(t: f32[1, 96, 2], 2, 2)
   aten::clone(t: f32[1, 96, 1])
-  redistribute_input(1, RP -> S(1)[0]S(1)[1])
-  redistribute_input(t: f32[1, 8, 16], trace: RP->S(1)P->S(1)[0]S(1)[1])
+  redistribute_input(1, RP(sum) -> S(1)[0]S(1)[1])
+  redistribute_input(t: f32[1, 8, 16], trace: RP(sum)->S(1)P(sum)->S(1)[0]S(1)[1])
   aten::chunk(t: f32[1, 8, 16], 4, 1)
   aten::clone(t: f32[1, 2, 16])
   aten::chunk(t: f32[1, 2, 16], 2, 1)
   aten::cat(['t: f32[1, 1, 16]', 't: f32[1, 1, 16]'])
   _c10d_functional::reduce_scatter_tensor(t: f32[2, 1, 16], sum, 2, 3)
   _c10d_functional::wait_tensor(t: f32[1, 1, 16])
   aten::bmm(t: f32[1, 96, 1], t: f32[1, 1, 16])
-  aten::view(dt: f32[1, 96, 16]| PP, [16, 6, 1, 4, 4])
+  aten::view(dt: f32[1, 96, 16]| P(sum)P(sum), [16, 6, 1, 4, 4])
   aten::view(t: f32[1, 96, 16], [16, 6, 1, 4, 4])
-  aten::permute(dt: f32[16, 6, 1, 4, 4]| PP, [0, 1, 3, 4, 2])
+  aten::permute(dt: f32[16, 6, 1, 4, 4]| P(sum)P(sum), [0, 1, 3, 4, 2])
   aten::permute(t: f32[16, 6, 1, 4, 4], [0, 1, 3, 4, 2])
-  aten::view(dt: f32[16, 6, 4, 4, 1]| PP, [16, 6, 4, 4])
+  aten::view(dt: f32[16, 6, 4, 4, 1]| P(sum)P(sum), [16, 6, 4, 4])
   aten::view(t: f32[16, 6, 4, 4, 1], [16, 6, 4, 4])""",
   )

test/distributed/tensor/test_api.py

Lines changed: 11 additions & 1 deletion
@@ -79,7 +79,13 @@ def test_distribute_tensor_rank(self):
   dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_minus_spec)
   self.assertEqual(dist_tensor.placements[0].dim, 1)

-  placement_combs = [[Shard(0)], [Shard(1)], [Replicate()]]
+  placement_combs = [
+      [Shard(0)],
+      [Shard(1)],
+      [Replicate()],
+      [Partial(reduce_op="sum")],
+      [Partial(reduce_op="avg")],
+  ]

   if not self.is_local_tensor_enabled:
       # test src_data_rank == 1
@@ -125,6 +131,10 @@ def test_distribute_tensor_errors(self):
   shard_spec = [Shard(0)]
   distribute_tensor(tensor_to_distribute, device_mesh, shard_spec)

+  with self.assertRaisesRegex(ValueError, "conversion is not supported"):
+      new_spec = [Replicate(), Partial(reduce_op="prod")]
+      distribute_tensor(tensor_to_distribute, device_mesh, new_spec)
+
   with self.assertRaisesRegex(RuntimeError, "distribute leaf tensor"):
       shard_spec = [Shard(0)]
       global_tensor = torch.randn(*tensor_shape, requires_grad=True)

test/distributed/tensor/test_dtensor_ops.py

Lines changed: 1 addition & 1 deletion
@@ -725,7 +725,7 @@ def run_mean(self):
   self.assertEqual(full_tensor, tensor.mean(dim=reduce_dim))

   if is_evenly_shardable:
-      self.assertTrue("P->R" in debug_mode.debug_string())
+      self.assertTrue("P(avg)->R" in debug_mode.debug_string())
   else:
       self.assertTrue("S(0)->R" in debug_mode.debug_string())

test/distributed/tensor/test_pointwise_ops.py

Lines changed: 25 additions & 0 deletions
@@ -148,6 +148,30 @@ def test_partial_add(self):
   d_3 = d_1 + d_2
   self.assertTrue(d_3._spec.placements[0].is_partial())

+  def test_partial_replicate_add(self):
+      device_mesh = self.build_device_mesh()
+      comm_mode = CommDebugMode()
+
+      for reduce_op in ("sum", "avg"):
+          d_1 = DTensor.from_local(
+              torch.rand(2, 2),
+              device_mesh,
+              [Partial(reduce_op=reduce_op)],
+          )
+          d_2 = DTensor.from_local(
+              torch.rand(2, 1),
+              device_mesh,
+              [Replicate()],
+              run_check=True,
+          )
+
+          with comm_mode:
+              d_3 = d_1 + d_2
+
+          self.assertEqual(comm_mode.get_total_counts(), 0)
+          self.assertEqual(d_3.placements, (Partial(reduce_op=reduce_op),))
+          self.assertEqual(d_3.full_tensor(), d_1.full_tensor() + d_2.full_tensor())
+
   def test_activations(self):
       device_mesh = self.build_device_mesh()
       self._run_sharded_elementwise_ops(
@@ -247,6 +271,7 @@ def test_dropout_backward(self):
       ),
   )

+  @skip_unless_torch_gpu
   def test_dropout_errors(self):
       device_mesh = self.build_device_mesh()
       with self.assertRaisesRegex(RuntimeError, "supported"):

torch/distributed/tensor/_api.py

Lines changed: 5 additions & 0 deletions
@@ -818,6 +818,11 @@ def distribute_tensor(
       local_tensor = Replicate._make_replicate_tensor(
           local_tensor, device_mesh, idx, src_data_rank
       )
+  elif isinstance(placement, Partial):
+      local_tensor = Replicate._make_replicate_tensor(
+          local_tensor, device_mesh, idx, src_data_rank
+      )
+      local_tensor = placement._partition_value(local_tensor, device_mesh, idx)
   else:
       raise RuntimeError(
           f"Trying to distribute tensor with unsupported placements {placement} on device mesh dimension {idx}!"

torch/distributed/tensor/_ops/_math_ops.py

Lines changed: 10 additions & 0 deletions
@@ -163,6 +163,16 @@ def __eq__(self, other: object) -> bool:
   def __hash__(self) -> int:
       return 1 + hash(self.norm_type)

+  def __repr__(self) -> str:
+      """
+      machine readable representation of the _NormPartial placement
+      """
+      return f"_NormPartial(reduce_op={self.reduce_op}, norm_type={self.norm_type})"
+
+  def __str__(self) -> str:
+      """human readable representation of the _NormPartial placement"""
+      return f"_NormP({self.reduce_op}, {self.norm_type})"
+

   def _infer_reduction_dims(dims_arg: object, ndim: int) -> Optional[list[int]]:
       if dims_arg is None:

torch/distributed/tensor/placement_types.py

Lines changed: 14 additions & 10 deletions
@@ -816,14 +816,18 @@ def _partition_value(
   # Partial placement contract #3:
   # _partition_value: partition the value of a replicated tensor on the mesh dimension

-  # _partition_value is the conjugate operation of _reduce_value
-  # - i.e. _partition_value on a sum reduce op is just a division operation
-  # - the _reduce_value on a sum reduce op would just be a sum(allreduce) operation
-  # TODO: if the reduce_op is min/max, etc. the _partition_value should be a
-  # different operation
-  assert self.reduce_op == "sum", "only support replicate to PartialSUM for now!"
+  # _partition_value is the conjugate operation of _reduce_value, e.g.
+  # - _partition_value on a sum reduce op is just a division operation
+  # - _reduce_value on a sum reduce op would just be a sum(allreduce) operation
   num_chunks = mesh.size(mesh_dim=mesh_dim)
-  return tensor / num_chunks
+  if self.reduce_op == "sum":
+      return tensor / num_chunks
+  elif self.reduce_op in ("avg", "min", "max"):
+      return tensor
+  else:
+      raise ValueError(
+          f"Replicate to Partial({self.reduce_op}) conversion is not supported."
+      )

   def __hash__(self) -> int:
       return 1 + hash(self.reduce_op)
@@ -838,7 +842,7 @@ def __str__(self) -> str:
   """
   human readable representation of the Partial placement
   """
-  return "P"
+  return f"P({self.reduce_op})"


   # We keep the old _Partial name for a while for BC reason
@@ -982,10 +986,10 @@ def __repr__(self) -> str:
   """
   machine readable representation of the MaskPartial placement
   """
-  return f"MaskPartial(offset_shape={self.offset_shape}, offset_dim={self.offset_dim})"
+  return f"MaskPartial(reduce_op={self.reduce_op}, offset_shape={self.offset_shape}, offset_dim={self.offset_dim})"

   def __str__(self) -> str:
       """
       human readable representation of the MaskPartial placement
       """
-      return "MaskP"
+      return f"MaskP({self.reduce_op}, {self.offset_shape}, {self.offset_dim})"
