Commit f4c47c9

[DTensor] Add prim and torch symbol for add (#2581)
1 parent c8ef0c4 commit f4c47c9

File tree (5 files changed: +163 -76 lines)

  thunder/clang/__init__.py
  thunder/clang/utils.py
  thunder/executors/nvfuserex_impl.py
  thunder/tests/distributed/test_dtensor.py
  thunder/torch/experimental/dtensor_torch_and_prims.py


thunder/clang/__init__.py

Lines changed: 9 additions & 74 deletions
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from collections.abc import Callable, Sequence
-from functools import partial, reduce
+from functools import partial, reduce, update_wrapper
 from numbers import Number
 from types import EllipsisType, NoneType
 from typing import Any, Union
@@ -10,6 +10,7 @@

 from thunder.clang.langctx import register_method
 from thunder.clang.utils import create_maybe_convert_to_dtype_with_prim, _elementwise_unary_wrapper
+import thunder.clang.utils as clang_utils
 from thunder.core import utils
 from thunder.core.baseutils import run_once
 from thunder.core.langctxs import langctx, Languages
@@ -368,33 +369,13 @@ def diagonal(a: TensorLike, offset: int = 0, dim1: int = 0, dim2: int = 1) -> Te

 # Expands a to the specified shape, possibly adding new dimensions and expanding
 # dimensions of length 1 to any length
-@clangop()
-def expand(a: TensorLike, *shape: int) -> TensorLike:
-    shape = utils.extract_shape_from_varargs(shape)
-
-    # TODO: improve this error message with error context
-    utils.check(
-        len(shape) >= len(a.shape),
-        lambda: "expand: the requested shape has too few dimensions!",
-    )
-
-    offset = len(shape) - len(a.shape)
-    shape_ = list(shape)
-    for idx, x in enumerate(a.shape):
-        offset_idx = idx + offset
-        requested_length = shape[offset_idx]
-        utils.check(
-            requested_length == x or x == 1 or requested_length == -1,
-            lambda: f"expand: attempting to expand a dimension of length {x}!",
-        )
-
-        shape_[offset_idx] = requested_length if requested_length != -1 else x
-
-    # At this point shape must be valid
-    # utils.check_valid_shape(shape_)
+expand = clangop()(partial(clang_utils.expand_impl, broadcast_prim=prims.broadcast_in_dim))
+# To preserve the docstring
+update_wrapper(expand, clang_utils.expand_impl)

-    # NOTE: Converting shape_ to tuple makes it possible to apply CSE
-    return prims.broadcast_in_dim(a, tuple(shape_), tuple(range(offset, len(a.shape) + offset)))
+maybe_broadcast = clangop()(partial(clang_utils.maybe_broadcast_impl, expand_fn=expand))
+# To preserve the docstring
+update_wrapper(maybe_broadcast, clang_utils.maybe_broadcast_impl)


 # TODO Resolve the start & end vs. start & stop inconsistencies with our operators (this one is start & end)
@@ -1085,31 +1066,7 @@ def stack(tensors: list[TensorProxy], dim: int):
     return cat(tensors_, dim)


-@clangop()
-def compute_broadcast_shape(*_shapes):
-    """Computes the common shape with the fewest dimensions that all input shapes can be broadcast to."""
-    shapes = tuple(x for x in filter(lambda x: x is not None, _shapes))
-
-    # Short-circuits if there are no inputs shapes
-    # This might happen in calls like add(2, 3)
-    if len(shapes) == 0:
-        return None
-
-    common_shape = [
-        1,
-    ] * reduce(max, (len(shape) for shape in shapes))
-
-    for shape in shapes:
-        for idx in range(-1, -1 - len(shape), -1):
-            if common_shape[idx] == 1:
-                common_shape[idx] = shape[idx]
-
-            utils.check(
-                (shape[idx] == 1) or (common_shape[idx] == shape[idx]),
-                lambda: f"Attempting to broadcast a dimension of length {shape[idx]}!",
-            )
-
-    return tuple(common_shape)
+compute_broadcast_shape = clangop()(clang_utils.compute_broadcast_shape)


 @run_once
@@ -1155,28 +1112,6 @@ def matrix_transpose(a: TensorProxy) -> TensorProxy:
     return transpose(a, permutation)


-# TODO: add scalar support
-# TODO: review hasattr pattern
-# NOTE: the tensor is not broadcasted if it is a CPU scalar tensor and treat_cpu_scalar_tensors_as_numbers=True
-@clangop()
-def maybe_broadcast(*args, treat_cpu_scalar_tensors_as_numbers=True):
-    """Returns tensors with the same shape, possibly broadcasting inputs to the result shape."""
-
-    # Computes common shape
-    common_shape = compute_broadcast_shape(*map(lambda t: t.shape if hasattr(t, "shape") else None, args))
-
-    def _maybe_broadcast(x, shape):
-        if treat_cpu_scalar_tensors_as_numbers and utils.is_cpu_scalar_tensor(x):
-            return x
-        if hasattr(x, "shape"):
-            if not utils.same_shape(x.shape, common_shape):
-                return expand(x, common_shape)
-
-        return x
-
-    return tuple(_maybe_broadcast(x, common_shape) for x in args)
-
-
 #
 # Elementwise unary operations
 #
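
The net effect of this refactor: the clang-level `expand` and `maybe_broadcast` become thin `clangop` wrappers around shared `*_impl` helpers that receive the broadcast primitive (or expand function) as a keyword argument, so the same logic can later be reused with DTensor primitives. A minimal standalone sketch of the `partial` + `update_wrapper` pattern used above (the names below are illustrative, not Thunder code):

from functools import partial, update_wrapper

def expand_impl_sketch(a, shape, *, broadcast_prim):
    """Expand `a` to `shape` via the supplied broadcast primitive."""
    return broadcast_prim(a, shape)

def fake_broadcast_in_dim(a, shape):
    # Hypothetical stand-in for prims.broadcast_in_dim.
    return f"broadcast_in_dim({a!r}, {shape})"

# Bind the shared implementation to a concrete primitive, then copy the
# name/docstring from the impl so the public callable stays introspectable.
expand_sketch = partial(expand_impl_sketch, broadcast_prim=fake_broadcast_in_dim)
update_wrapper(expand_sketch, expand_impl_sketch)

print(expand_sketch("a", (2, 3)))  # broadcast_in_dim('a', (2, 3))
print(expand_sketch.__doc__)       # docstring copied from expand_impl_sketch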

thunder/clang/utils.py

Lines changed: 78 additions & 0 deletions
@@ -1,6 +1,7 @@
 from numbers import Number
 from collections.abc import Sequence
 from collections.abc import Callable
+from functools import reduce

 from thunder.core import utils
 import thunder.core.dtypes as dtypes
@@ -11,6 +12,8 @@
     TensorProxy,
 )

+TensorLike = TensorProxy
+

 def create_maybe_convert_to_dtype_with_prim(conversion_prim: Symbol):
     assert isinstance(conversion_prim, Symbol)
@@ -66,3 +69,78 @@ def _elementwise_unary_wrapper(
     result = dtype_conversion_fn(result, result_dtype)

     return result
+
+
+def compute_broadcast_shape(*_shapes):
+    """Computes the common shape with the fewest dimensions that all input shapes can be broadcast to."""
+    shapes = tuple(x for x in filter(lambda x: x is not None, _shapes))
+
+    # Short-circuits if there are no inputs shapes
+    # This might happen in calls like add(2, 3)
+    if len(shapes) == 0:
+        return None
+
+    common_shape = [
+        1,
+    ] * reduce(max, (len(shape) for shape in shapes))
+
+    for shape in shapes:
+        for idx in range(-1, -1 - len(shape), -1):
+            if common_shape[idx] == 1:
+                common_shape[idx] = shape[idx]
+
+            utils.check(
+                (shape[idx] == 1) or (common_shape[idx] == shape[idx]),
+                lambda: f"Attempting to broadcast a dimension of length {shape[idx]}!",
+            )
+
+    return tuple(common_shape)
+
+
+def expand_impl(a: TensorLike, *shape: int, broadcast_prim: Symbol) -> TensorLike:
+    shape = utils.extract_shape_from_varargs(shape)
+
+    # TODO: improve this error message with error context
+    utils.check(
+        len(shape) >= len(a.shape),
+        lambda: "expand: the requested shape has too few dimensions!",
+    )
+
+    offset = len(shape) - len(a.shape)
+    shape_ = list(shape)
+    for idx, x in enumerate(a.shape):
+        offset_idx = idx + offset
+        requested_length = shape[offset_idx]
+        utils.check(
+            requested_length == x or x == 1 or requested_length == -1,
+            lambda: f"expand: attempting to expand a dimension of length {x}!",
+        )
+
+        shape_[offset_idx] = requested_length if requested_length != -1 else x
+
+    # At this point shape must be valid
+    # utils.check_valid_shape(shape_)
+
+    # NOTE: Converting shape_ to tuple makes it possible to apply CSE
+    return broadcast_prim(a, tuple(shape_), tuple(range(offset, len(a.shape) + offset)))
+
+
+# TODO: add scalar support
+# TODO: review hasattr pattern
+# NOTE: the tensor is not broadcasted if it is a CPU scalar tensor and treat_cpu_scalar_tensors_as_numbers=True
+def maybe_broadcast_impl(*args, treat_cpu_scalar_tensors_as_numbers=True, expand_fn: Callable):
+    """Returns tensors with the same shape, possibly broadcasting inputs to the result shape."""
+
+    # Computes common shape
+    common_shape = compute_broadcast_shape(*map(lambda t: t.shape if hasattr(t, "shape") else None, args))
+
+    def _maybe_broadcast(x, shape):
+        if treat_cpu_scalar_tensors_as_numbers and utils.is_cpu_scalar_tensor(x):
+            return x
+        if hasattr(x, "shape"):
+            if not utils.same_shape(x.shape, common_shape):
+                return expand_fn(x, common_shape)
+
+        return x
+
+    return tuple(_maybe_broadcast(x, common_shape) for x in args)
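
As a point of reference, the broadcasting helper that now lives in `thunder.clang.utils` can be exercised with plain tuples, since it only uses `len()` and indexing. A small sanity check, assuming Thunder is installed; the expected values follow standard NumPy/PyTorch broadcasting rules:

from thunder.clang.utils import compute_broadcast_shape

assert compute_broadcast_shape((3, 1, 5), (4, 5)) == (3, 4, 5)
assert compute_broadcast_shape((2, 3), None) == (2, 3)  # None stands in for a Number operand
assert compute_broadcast_shape() is None                # e.g. add(2, 3) with two Python scalars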

thunder/executors/nvfuserex_impl.py

Lines changed: 1 addition & 0 deletions
@@ -1772,6 +1772,7 @@ def _add(a: TensorProxy | Number, b: TensorProxy | Number, *, fd: FusionDefiniti


 register_supported(PrimIDs.ADD, _add, _elementwise_binary_check)
+register_dtensor_supported(DTensorPrimIDs.ADD, _add, _elementwise_binary_check)


 def atan2(a: TensorProxy | Number, b: TensorProxy | Number, *, fd: FusionDefinition, lc_to_nv_map: dict) -> Any:
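
The one-line change reuses the existing `_add` translation and `_elementwise_binary_check` for the new DTensor ADD primitive. For intuition only, a registry of this shape (prim ID mapped to a translator plus an applicability check) could look like the sketch below; this is an illustrative pattern, not the nvFuser executor's actual data structure:

from typing import Callable

# prim ID -> (translation function, applicability check); names are hypothetical.
_translation_registry: dict[object, tuple[Callable, Callable]] = {}

def register_supported_sketch(prim_id, translator: Callable, checker: Callable) -> None:
    # Registering the DTensor ADD prim with the same translator as the regular
    # ADD prim means no new fusion translation logic is needed for DTensor add.
    _translation_registry[prim_id] = (translator, checker)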

thunder/tests/distributed/test_dtensor.py

Lines changed: 26 additions & 1 deletion
@@ -40,7 +40,17 @@
 # to choose between DTensor supported symbol (from `dtensor_torch_and_prims.py`) or the usual `ltorch` symbol.
 # This is why we need to make sure that the OpInfo uses PyTorch native op as `op` which is passed to thunder.jit.
 class DTensorOpInfo:
-    def __init__(self, *, name, op, torch_reference, supports_grad, sample_inputs, skip_noncontiguous_for_executor=()):
+    def __init__(
+        self,
+        *,
+        name,
+        op,
+        torch_reference,
+        supports_grad,
+        sample_inputs,
+        skip_noncontiguous_for_executor=(),
+        skip_for_executor=(),
+    ):
         self.name = name
         assert "torch" in op.__module__, "OpInfo must use PyTorch native op as `op` which is passed to thunder.jit"
         self.op = op
@@ -54,6 +64,9 @@ def __init__(self, *, name, op, torch_reference, supports_grad, sample_inputs, s
         assert isinstance(skip_noncontiguous_for_executor, tuple), "skip_noncontiguous_for_executor must be a tuple"
         self.skip_noncontiguous_for_executor = skip_noncontiguous_for_executor

+        assert isinstance(skip_for_executor, tuple), "skip_for_executor must be a tuple"
+        self.skip_for_executor = skip_for_executor
+

 # DTensor supported ops
 dtensor_supported_opinfos = (
@@ -98,6 +111,15 @@ def __init__(self, *, name, op, torch_reference, supports_grad, sample_inputs, s
         # Ref:https://github.com/NVIDIA/Fuser/pull/5124
         skip_noncontiguous_for_executor=("nvfuser",),
     ),
+    DTensorOpInfo(
+        name="add",
+        op=torch.add,
+        torch_reference=torch.add,
+        supports_grad=False,
+        sample_inputs=get_opinfo("add").sample_inputs,
+        # Ref:https://github.com/NVIDIA/Fuser/issues/5314
+        skip_for_executor=("nvfuser",),
+    ),
 )

 skip_opinfos = (
@@ -309,6 +331,9 @@ def test_dtensor_opinfo(self, op: OpInfo, executor):
         if op.name in skip_opinfos:
             raise unittest.SkipTest(f"test_dtensor_opinfo: Skipping {op.name} as it is in skip_opinfos")

+        if executor in op.skip_for_executor:
+            raise unittest.SkipTest(f"test_dtensor_opinfo: Skipping {op.name} as it is in skip_for_executor")
+
         # NOTE: This test only tests for dtype=torch.float32 and requires_grad=True
         # not for all dtype which are supported by the operation.
         num_devices = self.world_size
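
Roughly speaking, the new `add` entry makes the opinfo test trace `torch.add` over DTensor inputs through `thunder.jit` (the nvFuser executor is skipped because of the linked Fuser issue). A hedged sketch of that flow outside the test harness, assuming a 2-rank process group on CPU (mesh size, shapes, and backend are illustrative):

import torch
import thunder
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# Assumes torch.distributed is already initialized with 2 ranks (e.g. via torchrun).
mesh = init_device_mesh("cpu", (2,))
a = distribute_tensor(torch.randn(16, 16), mesh, [Shard(0)])
b = distribute_tensor(torch.randn(16, 16), mesh, [Shard(0)])

# The OpInfo requires the PyTorch-native op to be what gets passed to thunder.jit.
jitted_add = thunder.jit(torch.add)
out = jitted_add(a, b)
torch.testing.assert_close(out.to_local(), torch.add(a, b).to_local())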

thunder/torch/experimental/dtensor_torch_and_prims.py

Lines changed: 49 additions & 1 deletion
@@ -7,7 +7,12 @@
 import thunder.torch as ltorch
 from thunder.core.pytree import tree_flatten
 from thunder import clang
-from thunder.clang.utils import create_maybe_convert_to_dtype_with_prim, _elementwise_unary_wrapper
+from thunder.clang.utils import (
+    create_maybe_convert_to_dtype_with_prim,
+    _elementwise_unary_wrapper,
+    maybe_broadcast_impl,
+    expand_impl,
+)
 from thunder.torch.experimental.dtensor_utils import run_with_fake_tensor
 from thunder.torch.experimental.dtensor_proxy import DTensorProxy, create_dtensor_proxy_from_proxies
 from thunder.torch.langctx import register_method
@@ -33,6 +38,7 @@
 class DTensorPrimIDs(Enum):
     # DTensor-specific primitives
     CHECK_DTENSOR_SPEC_REPR = auto()
+    ADD = auto()
     MUL = auto()
     RESHAPE = auto()
     CONVERT_ELEMENT_TYPE = auto()
@@ -365,6 +371,47 @@ def dtensor_reciprocal(a: TensorLike) -> TensorLike:
     )


+expand = partial(expand_impl, broadcast_prim=dtensor_broadcast_in_dim_prim)
+maybe_broadcast = partial(maybe_broadcast_impl, expand_fn=expand)
+
+
+def _elementwise_binary_wrapper(a, b, *, prim, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT):
+    computation_dtype, result_dtype = utils.elementwise_type_promotion(a, b, type_promotion_kind=type_promotion_kind)
+
+    a, b = maybe_broadcast(a, b)
+    a, b = maybe_convert_to_dtype(a, computation_dtype), maybe_convert_to_dtype(b, computation_dtype)
+
+    result = prim(a, b)
+    result = maybe_convert_to_dtype(result, result_dtype)
+
+    return result
+
+
+def dtensor_add_meta(a, b):
+    output = run_with_fake_tensor(torch.add, a, b)
+    local_tensor_proxy = TensorProxy(like=a.local_tensor)
+    spec = output._spec
+    spec_proxy = AnyProxy(spec, history=a.history)
+    return create_dtensor_proxy_from_proxies(local_tensor_proxy, spec_proxy, False)
+
+
+dtensor_add_prim = make_prim(DTensorPrimIDs.ADD, "dtensor_add_prim", meta=dtensor_add_meta)
+
+dtensor_add_prim_impl = pytorchex.register_operator("dtensor_add_prim", like=dtensor_add_prim, fn=torch.add)
+
+pytorchex.register_implementation(dtensor_add_prim, dtensor_add_prim_impl)
+
+
+@dtensor_torchsymbol(torch.add, id="dtensor.torch.add")
+def dtensor_add(a: TensorLike, b: TensorLike) -> TensorLike:
+    return _elementwise_binary_wrapper(
+        a,
+        b,
+        prim=dtensor_add_prim,
+        type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+    )
+
+
 if LooseVersion(torch.__version__) >= "2.8":

     def dtensor_grouped_mm_meta(a, b, offsets):
@@ -394,6 +441,7 @@ def dtensor_grouped_mm(a: TensorLike, b: TensorLike, offsets: TensorLike, *, bia


 def register_dtensor_torch_and_prims():
+    register_function_for_dtensor(torch.add, ltorch.add, dtensor_add, is_method=True)
     register_function_for_dtensor(torch.mul, ltorch.mul, dtensor_mul, is_method=True)
     register_function_for_dtensor(torch.reshape, ltorch.reshape, dtensor_reshape, is_method=True)
     register_function_for_dtensor(torch.nn.functional.linear, ltorch.linear, dtensor_linear, is_method=False)
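
The add support above follows a four-step recipe: a meta function that runs the op on fake tensors to derive the output DTensor spec, a prim built from it, a PyTorch-executor implementation, and a torchsymbol hooked up in `register_dtensor_torch_and_prims`. A hedged sketch of how a follow-up elementwise op could reuse `_elementwise_binary_wrapper`, using `torch.sub` purely as a hypothetical example (including `DTensorPrimIDs.SUB`, which this commit does not define) and assuming the same module-level imports as `dtensor_torch_and_prims.py`:

def dtensor_sub_meta(a, b):
    # Same spec inference as dtensor_add_meta, just for torch.sub.
    output = run_with_fake_tensor(torch.sub, a, b)
    local_tensor_proxy = TensorProxy(like=a.local_tensor)
    spec_proxy = AnyProxy(output._spec, history=a.history)
    return create_dtensor_proxy_from_proxies(local_tensor_proxy, spec_proxy, False)

# DTensorPrimIDs.SUB would have to be added to the enum first.
dtensor_sub_prim = make_prim(DTensorPrimIDs.SUB, "dtensor_sub_prim", meta=dtensor_sub_meta)
dtensor_sub_prim_impl = pytorchex.register_operator("dtensor_sub_prim", like=dtensor_sub_prim, fn=torch.sub)
pytorchex.register_implementation(dtensor_sub_prim, dtensor_sub_prim_impl)

@dtensor_torchsymbol(torch.sub, id="dtensor.torch.sub")
def dtensor_sub(a: TensorLike, b: TensorLike) -> TensorLike:
    return _elementwise_binary_wrapper(a, b, prim=dtensor_sub_prim)

# And inside register_dtensor_torch_and_prims():
#     register_function_for_dtensor(torch.sub, ltorch.sub, dtensor_sub, is_method=True)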
