Skip to content

Commit 01cd833

Browse files
dstaay-fb and facebook-github-bot
authored and committed
hg mv rdma.py to monarch/rdma (#1381)
Summary: Pull Request resolved: #1381 As per user discussions, the import path is confusing. Move of import path from monarch/tensor_engine to monarch/rdma. Resubmit of prior diff w/ proper edit history mechanics. Additional minor changes - rename is_available() to is_rdma_available() - some linter fixes - update reference example (grpo_actor) to use updated spawn behavior. Reviewed By: zdevito Differential Revision: D83583220 fbshipit-source-id: 5d54a07d8624ae9ae0dfd238e8a292649f1fd96e
1 parent 3e42686 commit 01cd833

File tree

11 files changed

+57
-38
lines changed

11 files changed

+57
-38
lines changed

docs/source/examples/getting_started.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def __supervise__(self, event):
293293
# once an actor has a handle to a buffer, it can read or write to the buffer without the owner of the buffer.
294294

295295
import torch
296-
from monarch.tensor_engine import RDMABuffer
296+
from monarch.rdma import RDMABuffer
297297

298298

299299
class ParameterServer(Actor):
@@ -347,8 +347,6 @@ def sync_weights(self, server: ParameterServer):
347347
#
348348
# We can use distributed features by 'activating' a ProcMesh:
349349

350-
import torch
351-
352350
with trainer_procs.activate():
353351
t = torch.rand(3, 4)
354352
print(t)

docs/source/examples/grpo_actor.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
import torch.nn as nn
3838
import torch.optim as optim
3939
from monarch.actor import Actor, endpoint, this_host
40-
from monarch.tensor_engine import RDMABuffer
40+
from monarch.rdma import RDMABuffer
4141
from torch.distributions import Categorical, kl_divergence
4242

4343
# %%
@@ -503,14 +503,14 @@ async def main():
503503
gen_mesh = this_host().spawn_procs(per_host={"gpus": 2})
504504

505505
# Spawn actors on the learner mesh
506-
traj_q = await learner_mesh.spawn("traj", TrajectoryQueue)
507-
replay_buf = await learner_mesh.spawn("rb", ReplayBuffer)
508-
learner = await learner_mesh.spawn("learner", Learner, replay_buf)
509-
scorer = await learner_mesh.spawn("scorer", Scorer, traj_q, replay_buf)
506+
traj_q = learner_mesh.spawn("traj", TrajectoryQueue)
507+
replay_buf = learner_mesh.spawn("rb", ReplayBuffer)
508+
learner = learner_mesh.spawn("learner", Learner, replay_buf)
509+
scorer = learner_mesh.spawn("scorer", Scorer, traj_q, replay_buf)
510510

511511
# Get weight buffers and spawn generators on the generator mesh
512512
wb = await learner.weights_handle.call_one()
513-
generators = await gen_mesh.spawn(
513+
generators = gen_mesh.spawn(
514514
"generator",
515515
Generator,
516516
wb,
@@ -531,10 +531,12 @@ async def main():
531531
learner.step.call_one(),
532532
)
533533
print(f"[Step {step:02d}] loss={loss:.3f}")
534-
# Clean up
534+
# Clean up - stop the scorer and wait for background task to complete
535+
print("🛑 Stopping scorer...")
535536
await scorer.stop.call_one()
536537
await scorer_run_future
537-
print("✅ done")
538+
539+
print("✅ Training complete")
538540

539541

540542
# %%

docs/source/rdma.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import torch
1717
from monarch.actor import Actor, endpoint, this_host
18-
from monarch.tensor_engine import RDMABuffer
18+
from monarch.rdma import RDMABuffer
1919

2020
# %%
2121
# Point-to-Point RDMA
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-unsafe

python/monarch/_src/tensor_engine/rdma.py renamed to python/monarch/_src/rdma/rdma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class RDMAWriteTransferWarning(Warning):
4343
warnings.simplefilter("once", RDMAWriteTransferWarning)
4444

4545

46-
def is_available():
46+
def is_rdma_available():
4747
return _RdmaBuffer.rdma_supported()
4848

4949

@@ -210,7 +210,7 @@ def __init__(
210210
_check_cuda_expandable_segments_enabled()
211211

212212
assert (
213-
is_available()
213+
is_rdma_available()
214214
), "Tried to create an RDMABuffer, but RDMA is not available on this platform."
215215

216216
# We need to ensure that _RdmaManager is initialized at this point, because under the hood

python/monarch/rdma/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-unsafe
8+
9+
"""
10+
Monarch RDMA API - Public interface for RDMA functionality.
11+
"""
12+
13+
from monarch._src.rdma.rdma import (
14+
is_rdma_available,
15+
RDMABuffer,
16+
RDMAReadTransferWarning,
17+
RDMAWriteTransferWarning,
18+
)
19+
20+
__all__ = [
21+
"is_rdma_available",
22+
"RDMABuffer",
23+
"RDMAReadTransferWarning",
24+
"RDMAWriteTransferWarning",
25+
]

python/monarch/tensor_engine/__init__.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,7 @@
66

77
# pyre-unsafe
88

9-
"""
10-
Monarch Tensor Engine API - Public interface for tensor engine functionality.
11-
"""
9+
# Currently empty - RDMA has moved to monarch.rdma
10+
# Future tensor engine functionality will be added here
1211

13-
from monarch._src.tensor_engine.rdma import (
14-
is_available,
15-
RDMABuffer,
16-
RDMAReadTransferWarning,
17-
RDMAWriteTransferWarning,
18-
)
19-
20-
__all__ = [
21-
"is_available",
22-
"RDMABuffer",
23-
"RDMAReadTransferWarning",
24-
"RDMAWriteTransferWarning",
25-
]
12+
__all__ = []

python/tests/rdma_load_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
# pyre-ignore
6262
import torch
6363
from monarch.actor import Actor, endpoint, this_host
64-
from monarch.tensor_engine import RDMABuffer
64+
from monarch.rdma import RDMABuffer
6565

6666

6767
class RDMATest(Actor):

python/tests/test_rdma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313
import pytest
1414
import torch
1515
from monarch.actor import Actor, current_rank, endpoint, this_host
16-
from monarch.tensor_engine import is_available as rdma_available, RDMABuffer
16+
from monarch.rdma import is_rdma_available, RDMABuffer
1717

1818

1919
needs_cuda = pytest.mark.skipif(
2020
not torch.cuda.is_available(),
2121
reason="CUDA not available",
2222
)
2323
needs_rdma = pytest.mark.skipif(
24-
not rdma_available(),
24+
not is_rdma_available(),
2525
reason="RDMA not available",
2626
)
2727

python/tests/test_rdma_unit.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
# pyre-unsafe
88
"""
9-
Unit tests for python/monarch/_src/tensor_engine/rdma.py
9+
Unit tests for python/monarch/_src/rdma/rdma.py
1010
1111
RDMA Testing Architecture - Dataflow Summary
1212
===========================================
@@ -87,7 +87,7 @@ async def test_new_operation():
8787

8888
import torch
8989
from monarch.actor import Actor, endpoint, this_host
90-
from monarch.tensor_engine import is_available as rdma_available, RDMABuffer
90+
from monarch.rdma import is_rdma_available, RDMABuffer
9191

9292
TIMEOUT = 60 # 60 seconds
9393

@@ -110,7 +110,7 @@ def _get_temp_root():
110110
reason="CUDA not available",
111111
)
112112
needs_rdma = pytest.mark.skipif(
113-
not rdma_available(),
113+
not is_rdma_available(),
114114
reason="RDMA not available",
115115
)
116116

0 commit comments

Comments (0)