Skip to content

Commit 01cd833

Browse files
dstaay-fb and facebook-github-bot
authored and committed
hg mv rdma.py to monarch/rdma (#1381)
Summary: Pull Request resolved: #1381 As per user discussions, the import path is confusing. Move of import path from monarch/tensor_engine to monarch/rdma. Resubmit of prior diff w/ proper edit history mechanics. Additional minor changes - rename is_available() to is_rdma_available() - some linter fixes - update reference example (grpo_actor) to use updated spawn behavior. Reviewed By: zdevito Differential Revision: D83583220 fbshipit-source-id: 5d54a07d8624ae9ae0dfd238e8a292649f1fd96e
1 parent 3e42686 commit 01cd833

File tree

11 files changed

+57
-38
lines changed

11 files changed

+57
-38
lines changed

docs/source/examples/getting_started.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def __supervise__(self, event):
293293
# once an actor has a handle to a buffer, it can read or write to the buffer without the owner of the buffer.
294294

295295
import torch
296-
from monarch.tensor_engine import RDMABuffer
296+
from monarch.rdma import RDMABuffer
297297

298298

299299
class ParameterServer(Actor):
@@ -347,8 +347,6 @@ def sync_weights(self, server: ParameterServer):
347347
#
348348
# We can use distributed features by 'activating' a ProcMesh:
349349

350-
import torch
351-
352350
with trainer_procs.activate():
353351
t = torch.rand(3, 4)
354352
print(t)

docs/source/examples/grpo_actor.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
import torch.nn as nn
3838
import torch.optim as optim
3939
from monarch.actor import Actor, endpoint, this_host
40-
from monarch.tensor_engine import RDMABuffer
40+
from monarch.rdma import RDMABuffer
4141
from torch.distributions import Categorical, kl_divergence
4242

4343
# %%
@@ -503,14 +503,14 @@ async def main():
503503
gen_mesh = this_host().spawn_procs(per_host={"gpus": 2})
504504

505505
# Spawn actors on the learner mesh
506-
traj_q = await learner_mesh.spawn("traj", TrajectoryQueue)
507-
replay_buf = await learner_mesh.spawn("rb", ReplayBuffer)
508-
learner = await learner_mesh.spawn("learner", Learner, replay_buf)
509-
scorer = await learner_mesh.spawn("scorer", Scorer, traj_q, replay_buf)
506+
traj_q = learner_mesh.spawn("traj", TrajectoryQueue)
507+
replay_buf = learner_mesh.spawn("rb", ReplayBuffer)
508+
learner = learner_mesh.spawn("learner", Learner, replay_buf)
509+
scorer = learner_mesh.spawn("scorer", Scorer, traj_q, replay_buf)
510510

511511
# Get weight buffers and spawn generators on the generator mesh
512512
wb = await learner.weights_handle.call_one()
513-
generators = await gen_mesh.spawn(
513+
generators = gen_mesh.spawn(
514514
"generator",
515515
Generator,
516516
wb,
@@ -531,10 +531,12 @@ async def main():
531531
learner.step.call_one(),
532532
)
533533
print(f"[Step {step:02d}] loss={loss:.3f}")
534-
# Clean up
534+
# Clean up - stop the scorer and wait for background task to complete
535+
print("🛑 Stopping scorer...")
535536
await scorer.stop.call_one()
536537
await scorer_run_future
537-
print("✅ done")
538+
539+
print("✅ Training complete")
538540

539541

540542
# %%

docs/source/rdma.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import torch
1717
from monarch.actor import Actor, endpoint, this_host
18-
from monarch.tensor_engine import RDMABuffer
18+
from monarch.rdma import RDMABuffer
1919

2020
# %%
2121
# Point-to-Point RDMA
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-unsafe

python/monarch/_src/tensor_engine/rdma.py renamed to python/monarch/_src/rdma/rdma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class RDMAWriteTransferWarning(Warning):
4343
warnings.simplefilter("once", RDMAWriteTransferWarning)
4444

4545

46-
def is_available():
46+
def is_rdma_available():
4747
return _RdmaBuffer.rdma_supported()
4848

4949

@@ -210,7 +210,7 @@ def __init__(
210210
_check_cuda_expandable_segments_enabled()
211211

212212
assert (
213-
is_available()
213+
is_rdma_available()
214214
), "Tried to create an RDMABuffer, but RDMA is not available on this platform."
215215

216216
# We need to ensure that _RdmaManager is initialized at this point, because under the hood

python/monarch/rdma/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-unsafe
8+
9+
"""
10+
Monarch RDMA API - Public interface for RDMA functionality.
11+
"""
12+
13+
from monarch._src.rdma.rdma import (
14+
is_rdma_available,
15+
RDMABuffer,
16+
RDMAReadTransferWarning,
17+
RDMAWriteTransferWarning,
18+
)
19+
20+
__all__ = [
21+
"is_rdma_available",
22+
"RDMABuffer",
23+
"RDMAReadTransferWarning",
24+
"RDMAWriteTransferWarning",
25+
]

python/monarch/tensor_engine/__init__.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,7 @@
66

77
# pyre-unsafe
88

9-
"""
10-
Monarch Tensor Engine API - Public interface for tensor engine functionality.
11-
"""
9+
# Currently empty - RDMA has moved to monarch.rdma
10+
# Future tensor engine functionality will be added here
1211

13-
from monarch._src.tensor_engine.rdma import (
14-
is_available,
15-
RDMABuffer,
16-
RDMAReadTransferWarning,
17-
RDMAWriteTransferWarning,
18-
)
19-
20-
__all__ = [
21-
"is_available",
22-
"RDMABuffer",
23-
"RDMAReadTransferWarning",
24-
"RDMAWriteTransferWarning",
25-
]
12+
__all__ = []

python/tests/rdma_load_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
# pyre-ignore
6262
import torch
6363
from monarch.actor import Actor, endpoint, this_host
64-
from monarch.tensor_engine import RDMABuffer
64+
from monarch.rdma import RDMABuffer
6565

6666

6767
class RDMATest(Actor):

python/tests/test_rdma.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313
import pytest
1414
import torch
1515
from monarch.actor import Actor, current_rank, endpoint, this_host
16-
from monarch.tensor_engine import is_available as rdma_available, RDMABuffer
16+
from monarch.rdma import is_rdma_available, RDMABuffer
1717

1818

1919
needs_cuda = pytest.mark.skipif(
2020
not torch.cuda.is_available(),
2121
reason="CUDA not available",
2222
)
2323
needs_rdma = pytest.mark.skipif(
24-
not rdma_available(),
24+
not is_rdma_available(),
2525
reason="RDMA not available",
2626
)
2727

python/tests/test_rdma_unit.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
# pyre-unsafe
88
"""
9-
Unit tests for python/monarch/_src/tensor_engine/rdma.py
9+
Unit tests for python/monarch/_src/rdma/rdma.py
1010
1111
RDMA Testing Architecture - Dataflow Summary
1212
===========================================
@@ -87,7 +87,7 @@ async def test_new_operation():
8787

8888
import torch
8989
from monarch.actor import Actor, endpoint, this_host
90-
from monarch.tensor_engine import is_available as rdma_available, RDMABuffer
90+
from monarch.rdma import is_rdma_available, RDMABuffer
9191

9292
TIMEOUT = 60 # 60 seconds
9393

@@ -110,7 +110,7 @@ def _get_temp_root():
110110
reason="CUDA not available",
111111
)
112112
needs_rdma = pytest.mark.skipif(
113-
not rdma_available(),
113+
not is_rdma_available(),
114114
reason="RDMA not available",
115115
)
116116

0 commit comments

Comments (0)