Skip to content

Commit 17b9c61

Browse files
kwen2501 authored and pytorchmergebot committed
[a2av] not returning out tensor from ops (pytorch#159435)
torch.compile of `all_to_all_vdev_2d` hits the following error: ``` torch._dynamo.exc.BackendCompilerFailed: backend='aot_eager' raised: RuntimeError: Found a custom (non-ATen) operator whose output has alias annotations: symm_mem::all_to_all_vdev_2d(Tensor input, Tensor(a!) out, Tensor in_splits, Tensor(a!) out_splits_offsets, str group_name, int? major_align=None) -> Tensor(a!). We only support functionalizing operators whose outputs do not have alias annotations (e.g. 'Tensor(a)' is a Tensor with an alias annotation whereas 'Tensor' is a Tensor without. The '(a)' is the alias annotation). The alias annotation specifies that the output Tensor shares storage with an input that has the same annotation. Please check if (1) the output needs to be an output (if not, don't return it), (2) if the output doesn't share storage with any inputs, then delete the alias annotation. (3) if the output indeed shares storage with an input, then add a .clone() before returning it to prevent storage sharing and then delete the alias annotation. Otherwise, please file an issue on GitHub. ``` This PR selects option (1). Pull Request resolved: pytorch#159435 Approved by: https://github.com/ngimel, https://github.com/xmfan
1 parent d3ce450 commit 17b9c61

File tree

4 files changed

+29
-8
lines changed

4 files changed

+29
-8
lines changed

torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,9 +342,9 @@ TORCH_LIBRARY_FRAGMENT(symm_mem, m) {
342342
m.def(
343343
"all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name) -> Tensor(a!)");
344344
m.def(
345-
"all_to_all_vdev_2d(Tensor input, Tensor(a!) out, Tensor in_splits, Tensor(a!) out_splits_offsets, str group_name, int? major_align=None) -> Tensor(a!)");
345+
"all_to_all_vdev_2d(Tensor input, Tensor(a!) out, Tensor in_splits, Tensor(a!) out_splits_offsets, str group_name, int? major_align=None) -> ()");
346346
m.def(
347-
"all_to_all_vdev_2d_offset(Tensor input, Tensor(a!) out, Tensor in_splits_offsets, Tensor(a!) out_splits_offsets, str group_name) -> Tensor(a!)");
347+
"all_to_all_vdev_2d_offset(Tensor input, Tensor(a!) out, Tensor in_splits_offsets, Tensor(a!) out_splits_offsets, str group_name) -> ()");
348348
}
349349

350350
TORCH_LIBRARY_IMPL(symm_mem, Meta, m) {

torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ __global__ void allToAllV_2d(void *send_data, void *recv_data, int64_t* in_split
539539
#endif
540540
}
541541

542-
at::Tensor all_to_all_vdev_2d(
542+
void all_to_all_vdev_2d(
543543
at::Tensor& input,
544544
at::Tensor& out,
545545
at::Tensor& in_splits,
@@ -685,10 +685,9 @@ at::Tensor all_to_all_vdev_2d(
685685
args1,
686686
0,
687687
stream);
688-
return out;
689688
}
690689

691-
at::Tensor all_to_all_vdev_2d_offset(
690+
void all_to_all_vdev_2d_offset(
692691
at::Tensor& input,
693692
at::Tensor& out,
694693
at::Tensor& in_splits_offsets,
@@ -819,7 +818,6 @@ at::Tensor all_to_all_vdev_2d_offset(
819818
args1,
820819
0,
821820
stream);
822-
return out;
823821
}
824822
} // namespace c10d::nvshmem_extension
825823

torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,15 @@ at::Tensor all_to_all_vdev(
3838
at::Tensor& in_out_splits,
3939
std::string group_name);
4040

41-
at::Tensor all_to_all_vdev_2d(
41+
void all_to_all_vdev_2d(
4242
at::Tensor& input,
4343
at::Tensor& out,
4444
at::Tensor& in_splits,
4545
at::Tensor& out_splits_offsets,
4646
std::string group_name,
4747
std::optional<int64_t> major_align = std::nullopt);
4848

49-
at::Tensor all_to_all_vdev_2d_offset(
49+
void all_to_all_vdev_2d_offset(
5050
at::Tensor& input,
5151
at::Tensor& out,
5252
at::Tensor& in_splits_offsets,

torch/distributed/_symmetric_memory/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1609,6 +1609,29 @@ def _low_contention_reduce_scatter(
16091609
)
16101610

16111611

1612+
@torch.library.impl(lib, "all_to_all_vdev_2d", "Meta")
1613+
def _all_to_all_vdev_2d_meta(
1614+
input: torch.Tensor,
1615+
out: torch.Tensor,
1616+
in_splits: torch.Tensor,
1617+
out_splits_offsets: torch.Tensor,
1618+
group_name: str,
1619+
major_align: Optional[int] = None,
1620+
) -> None:
1621+
return None
1622+
1623+
1624+
@torch.library.impl(lib, "all_to_all_vdev_2d_offset", "Meta")
1625+
def _all_to_all_vdev_2d_offset_meta(
1626+
input: torch.Tensor,
1627+
out: torch.Tensor,
1628+
in_splits_offsets: torch.Tensor,
1629+
out_splits_offsets: torch.Tensor,
1630+
group_name: str,
1631+
) -> None:
1632+
return None
1633+
1634+
16121635
# =============================================================================
16131636
# User-facing APIs
16141637
# =============================================================================

0 commit comments

Comments (0)