Use hand-written schemas for PyTorch op registration to tag mutable arguments (#17)

bnellnm · web-flow · commit 80354e0685b6 · 2025-05-14T16:20:59.000+01:00
The mutable arguments for several ops were not being tagged. I've added schemas to the registration code so these arguments are marked properly. I've also added meta functions so the inductor can run. cc @abcdabcd987, @nandor, @varun-sundar-rabindranath --------- Signed-off-by: Bill Nell <bnell@redhat.com>
diff --git a/csrc/bindings/all_to_all_ops.cpp b/csrc/bindings/all_to_all_ops.cpp
@@ -195,6 +195,19 @@ void dispatch(
   );
 }
 
+void fake_dispatch( // Meta-backend stub for the all_to_all_*_dispatch ops; the body is intentionally empty.
+    fptr_t ptr,
+    at::Tensor &outExpertNumTokens, // declared mutable (Tensor!) in the op schema
+    at::Tensor &outExpertX, // declared mutable (Tensor!) in the op schema
+    const std::optional<at::Tensor> &outExpertXScale, // declared optional and mutable (Tensor!?) in the op schema
+    const at::Tensor &dpX,
+    const std::optional<at::Tensor> &dpXScale,
+    const at::Tensor &indices,
+    const std::optional<at::Tensor> &boundM,
+    bool doSend,
+    bool doRecv
+) {} // No-op: touches none of its arguments. Registered under c10::kMeta; per the commit message, added so the inductor can run.
+
 template <typename Kernel, typename T, typename U>
 void combineImpl(
     Kernel *all_to_all,
@@ -297,6 +310,17 @@ void combine(
   }
 }
 
+void fake_combine( // Meta-backend stub for the all_to_all_*_combine ops; the body is intentionally empty.
+    fptr_t ptr,
+    at::Tensor &outTokens, // declared mutable (Tensor!) in the op schema
+    const at::Tensor &indices,
+    const at::Tensor &weights,
+    const at::Tensor &expertY,
+    const std::optional<at::Tensor> &boundM,
+    bool doSend,
+    bool doRecv
+) {} // No-op: touches none of its arguments. Registered under c10::kMeta; per the commit message, added so the inductor can run.
+
 #undef _CHECK_TENSOR
 
 } // namespace
@@ -306,11 +330,64 @@ void register_all_to_all_ops(torch::Library &m) {
   m.def("all_to_all_destroy", &destroy);
 
   m.def("all_to_all_internode_create", &create_internode);
-  m.def("all_to_all_internode_dispatch", &dispatch<AllToAllInterNode>);
-  m.def("all_to_all_internode_combine", &combine<AllToAllInterNode>);
+
+  m.def("all_to_all_internode_dispatch("
+        "  int fptr,"
+        "  Tensor! out_expert_num_tokens,"
+        "  Tensor! out_expert_x,"
+        "  Tensor!? out_expert_x_scale,"
+        "  Tensor dp_x,"
+        "  Tensor? dp_x_scale,"
+        "  Tensor indices,"
+        "  Tensor? bound_m,"
+        "  bool do_send,"
+        "  bool do_recv"
+        ") -> ()");
+  m.impl("all_to_all_internode_dispatch", c10::kCUDA, &dispatch<AllToAllInterNode>);
+  m.impl("all_to_all_internode_dispatch", c10::kMeta, &fake_dispatch);
+
+  m.def("all_to_all_internode_combine("
+        "  int fptr,"
+        "  Tensor! out_tokens,"
+        "  Tensor indices,"
+        "  Tensor weights,"
+        "  Tensor expert_y,"
+        "  Tensor? bound_m,"
+        "  bool do_send,"
+        "  bool do_recv"
+        ") -> ()");
+  m.impl("all_to_all_internode_combine", c10::kCUDA, &combine<AllToAllInterNode>);
+  m.impl("all_to_all_internode_combine", c10::kMeta, &fake_combine);
 
   m.def("all_to_all_intranode_create", &create_intranode);
-  m.def("all_to_all_intranode_dispatch", &dispatch<AllToAllIntraNode>);
-  m.def("all_to_all_intranode_combine", &combine<AllToAllIntraNode>);
+
+  m.def("all_to_all_intranode_dispatch("
+        "  int fptr,"
+        "  Tensor! out_expert_num_tokens,"
+        "  Tensor! out_expert_x,"
+        "  Tensor!? out_expert_x_scale,"
+        "  Tensor dp_x,"
+        "  Tensor? dp_x_scale,"
+        "  Tensor indices,"
+        "  Tensor? bound_m,"
+        "  bool do_send,"
+        "  bool do_recv"
+        ") -> ()");
+  m.impl("all_to_all_intranode_dispatch", c10::kCUDA, &dispatch<AllToAllIntraNode>);
+  m.impl("all_to_all_intranode_dispatch", c10::kMeta, &fake_dispatch);
+
+  m.def("all_to_all_intranode_combine("
+        "  int fptr,"
+        "  Tensor! out_tokens,"
+        "  Tensor indices,"
+        "  Tensor weights,"
+        "  Tensor expert_y,"
+        "  Tensor? bound_m,"
+        "  bool do_send,"
+        "  bool do_recv"
+        ") -> ()");
+  m.impl("all_to_all_intranode_combine", c10::kCUDA, &combine<AllToAllIntraNode>);
+  m.impl("all_to_all_intranode_combine", c10::kMeta, &fake_combine);
 }
+
 } // namespace pplx
diff --git a/csrc/bindings/nvshmem.cpp b/csrc/bindings/nvshmem.cpp
@@ -79,6 +79,8 @@ void alltoall(at::Tensor dest, at::Tensor source) {
   ));
 }
 
+void fake_alltoall(at::Tensor dest, at::Tensor source) {} // Meta-backend stub for nvshmem_alltoall: empty body, registered under c10::kMeta; `dest` is the argument the schema tags mutable (Tensor!).
+
 } // namespace
 
 void pplx::register_nvshmem_ops(torch::Library &m) {
@@ -91,5 +93,7 @@ void pplx::register_nvshmem_ops(torch::Library &m) {
   m.def("nvshmem_malloc", &malloc_tensor);
   m.def("nvshmem_barrier_all", &barrier_all);
   m.def("nvshmem_barrier_all_on_current_stream", &barrier_all_on_current_stream);
-  m.def("nvshmem_alltoall", &alltoall);
+  m.def("nvshmem_alltoall(Tensor! dest, Tensor src) -> ()");
+  m.impl("nvshmem_alltoall", c10::kCUDA, &alltoall);
+  m.impl("nvshmem_alltoall", c10::kMeta, &fake_alltoall);
 }
diff --git a/tests/test_all_to_all.py b/tests/test_all_to_all.py
@@ -50,6 +50,7 @@ def _do_test_all_to_all(
     dp_size: int,
     moe: MoEConfig,
     internode: bool,
+    use_compile: bool,
 ) -> None:
     rank = pgi.rank
     local_rank = pgi.local_rank
@@ -173,7 +174,10 @@ def _do_test_all_to_all(
         )
     bound_m = torch.tensor([rank_data.num_tokens], dtype=torch.uint32, device=device)
     logger.debug("[rank=%d] Dispatch", rank)
-    ata.dispatch(
+
+    dispatch = torch.compile(ata.dispatch) if use_compile else ata.dispatch
+
+    dispatch(
         out_expert_num_tokens=expert_num_tokens,
         out_expert_x=expert_x,
         out_expert_x_scale=expert_x_scale,
@@ -184,6 +188,7 @@ def _do_test_all_to_all(
         indices=rank_data.indices.to(device).to(torch.uint32),
         bound_m=bound_m,
     )
+
     torch.cuda.synchronize()
     logger.debug("[rank=%d] Dispatch done", rank)
 
@@ -253,7 +258,10 @@ def _do_test_all_to_all(
     )
 
     logger.debug("[rank=%d] Combine", rank)
-    ata.combine(
+
+    combine = torch.compile(ata.combine) if use_compile else ata.combine
+
+    combine(
         out_tokens=y,
         indices=rank_data.indices.to(device).to(torch.uint32),
         weights=rank_data.weights.to(device),
@@ -285,6 +293,7 @@ def _worker_test_all_to_all(
     out_dtype: str,
     moe_config: MoEConfig,
     internode: bool,
+    use_compile: bool = False,
 ) -> None:
     uid = nvshmem_get_unique_id() if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
     torch.distributed.broadcast(uid, src=0)
@@ -295,7 +304,8 @@ def _worker_test_all_to_all(
         in_dtype=getattr(torch, in_dtype),
         out_dtype=getattr(torch, out_dtype),
     )
-    _do_test_all_to_all(pgi, dp_size, moe_config, internode)
+
+    _do_test_all_to_all(pgi, dp_size, moe_config, internode, use_compile)
 
     nvshmem_finalize()
 
@@ -304,7 +314,10 @@ def _worker_test_all_to_all(
 @pytest.mark.parametrize("in_dtype", ["bfloat16", "float8_e4m3fn", "float16"])
 @pytest.mark.parametrize("out_dtype", ["float16", "bfloat16"])
 @pytest.mark.parametrize("internode", [True, False])
-def test_all_to_all_4_gpu(in_dtype: str, out_dtype: str, internode: bool) -> None:
+@pytest.mark.parametrize("use_compile", [False, True])
+def test_all_to_all_4_gpu(
+    in_dtype: str, out_dtype: str, internode: bool, use_compile: bool
+) -> None:
     world_size = 4
     dp_size = 2
     parallel_launch(
@@ -315,6 +328,7 @@ def test_all_to_all_4_gpu(in_dtype: str, out_dtype: str, internode: bool) -> Non
         out_dtype,
         small_moe,
         internode,
+        use_compile,
     )