Skip to content

Commit aedf526

Browse files
samlurye authored and
meta-codesync[bot] committed
Enable rdma tests on v1 (meta-pytorch#1507)
Summary: Pull Request resolved: meta-pytorch#1507. Enable rdma tests on v1. Also ensure proc mesh is initialized in `_ensure_init_rdma_manager` to prevent tokio runtime blocking issue. The tests in `test_rdma_unit.py` actually weren't running *at all*, even for v0, because whoever wrote them used a synchronous function decorator to wrap their async function tests.
ghstack-source-id: 315627106
Reviewed By: zdevito, mariusae
Differential Revision: D84395240
fbshipit-source-id: 53782e655f691587b4587694f507d13d6356d4a5
1 parent d8d2806 commit aedf526

File tree

7 files changed

+17
-23
lines changed

7 files changed

+17
-23
lines changed

hyperactor_mesh/src/resource.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,10 @@ pub enum Status {
7272
/// The resource is stopped.
7373
Stopped,
7474
/// The resource has failed, with an error message.
75+
#[strum(to_string = "Failed({0})")]
7576
Failed(String),
7677
/// The resource has been declared failed after a timeout.
78+
#[strum(to_string = "Timeout({0:?})")]
7779
Timeout(Duration),
7880
}
7981

python/monarch/_src/actor/proc_mesh.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,13 @@ def __eq__(self, other: object) -> bool:
161161
def _proc_mesh(self) -> Shared["HyProcMeshV0"]:
162162
return _deref_proc_mesh(self)._proc_mesh
163163

164+
@property
165+
def initialized(self) -> Future[Literal[True]]:
166+
async def task() -> Literal[True]:
167+
return True
168+
169+
return Future(coro=task())
170+
164171

165172
_proc_mesh_lock: threading.Lock = threading.Lock()
166173
_proc_mesh_key: int = 0

python/monarch/_src/rdma/rdma.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ def is_rdma_available():
5555
@functools.cache
5656
def _ensure_init_rdma_manager() -> Shared[None]:
5757
async def task() -> None:
58+
# Ensure the proc mesh is initialized before we can send it over the wire,
59+
# since pickling the proc mesh before it is initialized would block the
60+
# tokio runtime and cause a panic.
61+
await context().actor_instance.proc_mesh.initialized
5862
await (
5963
await get_or_spawn_controller("rdma_controller", RdmaController)
6064
).init_rdma_on_mesh.call_one(none_throws(context().actor_instance.proc_mesh))

python/tests/rdma_load_test.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import statistics
1212
import time
1313

14-
import pytest
1514

1615
# parse up front to extract env variables.
1716
args = None
@@ -63,16 +62,10 @@
6362

6463
# pyre-ignore
6564
import torch
66-
from monarch._src.actor.v1 import enabled as v1_enabled
6765
from monarch.actor import Actor, endpoint, this_host
6866
from monarch.rdma import RDMABuffer
6967

7068

71-
pytestmark: pytest.MarkDecorator = pytest.mark.skipif(
72-
v1_enabled, reason="ENABLE ME ASAP ONCE V1 RDMA LANDS"
73-
)
74-
75-
7669
class RDMATest(Actor):
7770
def __init__(
7871
self, device: str = "cpu", operation: str = "write", size_mb: int = 64

python/tests/test_rdma.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,10 @@
1212

1313
import pytest
1414
import torch
15-
from monarch._src.actor.v1 import enabled as v1_enabled
1615
from monarch.actor import Actor, current_rank, endpoint, this_host
1716
from monarch.rdma import is_rdma_available, RDMAAction, RDMABuffer
1817

1918

20-
pytestmark = pytest.mark.skipif(v1_enabled, reason="ENABLE ASAP ONCE V1 RDMA LANDS")
21-
22-
2319
needs_cuda = pytest.mark.skipif(
2420
not torch.cuda.is_available(),
2521
reason="CUDA not available",

python/tests/test_rdma_unit.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,10 @@ async def test_new_operation():
8686
import pytest
8787

8888
import torch
89-
from monarch._src.actor.v1 import enabled as v1_enabled
9089
from monarch.actor import Actor, endpoint, this_host
9190
from monarch.rdma import is_rdma_available, RDMABuffer
9291

9392

94-
pytestmark = pytest.mark.skipif(v1_enabled, reason="ENABLE ASAP ONCE V1 RDMA LANDS")
95-
96-
9793
TIMEOUT = 60 # 60 seconds
9894

9995

@@ -546,8 +542,8 @@ def _test_with_all_data(func):
546542
@pytest.mark.parametrize("controller_device", CONTROLLER_DEVICES)
547543
@pytest.mark.parametrize("receiver_device", RECEIVER_DEVICES)
548544
@pytest.mark.asyncio
549-
def marked(dtype, data_getter, controller_device, receiver_device):
550-
return func(dtype, data_getter, controller_device, receiver_device)
545+
async def marked(dtype, data_getter, controller_device, receiver_device):
546+
return await func(dtype, data_getter, controller_device, receiver_device)
551547

552548
return marked
553549

@@ -560,8 +556,8 @@ def _test_with_no_data(func):
560556
@pytest.mark.parametrize("controller_device", CONTROLLER_DEVICES)
561557
@pytest.mark.parametrize("receiver_device", RECEIVER_DEVICES)
562558
@pytest.mark.asyncio
563-
def marked(dtype, data_getter, controller_device, receiver_device):
564-
return func(dtype, data_getter, controller_device, receiver_device)
559+
async def marked(dtype, data_getter, controller_device, receiver_device):
560+
return await func(dtype, data_getter, controller_device, receiver_device)
565561

566562
return marked
567563

python/tests/test_rdma_unsupported.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,9 @@
1414
"""
1515

1616
import pytest
17-
from monarch._src.actor.v1 import enabled as v1_enabled
1817
from monarch.rdma import is_rdma_available
1918

2019

21-
pytestmark = pytest.mark.skipif(v1_enabled, reason="ENABLE ASAP ONCE V1 RDMA LANDS")
22-
23-
2420
needs_no_rdma = pytest.mark.skipif(
2521
is_rdma_available(),
2622
reason="RDMA is available, test only runs on systems without RDMA support",

0 commit comments

Comments
 (0)