Commit 7e52cb1

samlurye authored and facebook-github-bot committed

Enable log forwarding for v1 API (#1375)

Summary: Pull Request resolved: #1375

Use the logging manager to spawn the log client and log forwarders on v1 proc meshes.

ghstack-source-id: 313150889
Reviewed By: mariusae
Differential Revision: D83360166
fbshipit-source-id: 456673311006e001569c4b02923ee881f970e3d9

1 parent 5a863c3
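In effect, proc meshes created with v1=True now forward actor logs to the client the way v0 meshes already did. A minimal sketch of the flow this commit enables, borrowing spawn_procs_on_this_host and the Printer actor from the tests below (test helpers, not public API):

import asyncio

async def demo() -> None:
    # With v1=True, proc mesh creation now awaits LoggingManager.init(),
    # which spawns the log client and the per-proc log forwarders.
    pm = spawn_procs_on_this_host(v1=True, per_host={"gpus": 2})
    am = pm.spawn("printer", Printer)
    # This print is forwarded to (and aggregated on) the client process.
    await am.print.call("hello from a v1 proc")
    # Sleep past the default 3-second aggregation window so the aggregated
    # line is flushed; the tests below use the same trick instead of stop().
    await asyncio.sleep(5)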

File tree: 4 files changed, +91 -40 lines changed

monarch_hyperactor/src/v1/logging.rs

Lines changed: 5 additions & 1 deletion

@@ -17,6 +17,7 @@ use hyperactor_mesh::logging::LogClientMessage;
 use hyperactor_mesh::logging::LogForwardActor;
 use hyperactor_mesh::logging::LogForwardMessage;
 use hyperactor_mesh::v1::ActorMesh;
+use hyperactor_mesh::v1::Name;
 use hyperactor_mesh::v1::actor_mesh::ActorMeshRef;
 use ndslice::View;
 use pyo3::Bound;
@@ -80,7 +81,10 @@ impl LoggingMeshClient {
         PyPythonTask::new(async move {
             let client_actor: ActorHandle<LogClientActor> =
                 instance_dispatch!(instance, async move |cx_instance| {
-                    cx_instance.proc().spawn("log_client", ()).await
+                    cx_instance
+                        .proc()
+                        .spawn(&Name::new("log_client").to_string(), ())
+                        .await
                 })?;
             let client_actor_ref = client_actor.bind();
             let forwarder_mesh = instance_dispatch!(instance, async |cx_instance| {
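The only behavioral change above is the spawn name: the log client is registered under Name::new("log_client").to_string() rather than the fixed string "log_client". A hedged reading, since Name's implementation is not part of this diff: Name::new presumably derives a unique per-call name from the base string, so each v1 proc mesh can spawn its own LogClientActor in the same client proc without the names colliding.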

python/monarch/_src/actor/logging.py

Lines changed: 2 additions & 1 deletion

@@ -50,7 +50,8 @@ def flush_all_proc_mesh_logs(v1: bool = False) -> None:
         from monarch._src.actor.v1.proc_mesh import get_active_proc_meshes

         for pm in get_active_proc_meshes():
-            pm._logging_manager.flush()
+            if pm._logging_manager._logging_mesh_client is not None:
+                pm._logging_manager.flush()


 class LoggingManager:
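The new guard appears to cover v1 proc meshes that show up in get_active_proc_meshes() before their logging manager has finished init (the proc_mesh.py change below): for those, _logging_mesh_client is still None and there is no client-side log state to flush, so they are skipped rather than crashing the ipython post_run_cell hook.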

python/monarch/_src/actor/v1/proc_mesh.py

Lines changed: 1 addition & 2 deletions

@@ -200,8 +200,7 @@ async def task(
     ) -> HyProcMesh:
         hy_proc_mesh = await hy_proc_mesh_task

-        # FIXME: Fix log forwarding.
-        # await pm._logging_manager.init(hy_proc_mesh, stream_log_to_client)
+        await pm._logging_manager.init(hy_proc_mesh, stream_log_to_client)

         if setup_actor is not None:
             await setup_actor.setup.call()
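This is the heart of the commit: the FIXME is resolved and the previously commented-out call is restored, so v1 proc mesh creation now awaits LoggingManager.init before running any setup actor. Per the Rust change above, that init spawns the LogClientActor and the LogForwardActor forwarder mesh for the new procs.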

python/tests/test_python_actors.py

Lines changed: 83 additions & 36 deletions

@@ -37,17 +37,19 @@
 )
 from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
 from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
+from monarch._rust_bindings.monarch_hyperactor.shape import Extent

 from monarch._src.actor.actor_mesh import ActorMesh, Channel, context, Port
-from monarch._src.actor.allocator import AllocHandle
+from monarch._src.actor.allocator import AllocHandle, ProcessAllocator
 from monarch._src.actor.future import Future
 from monarch._src.actor.host_mesh import (
     create_local_host_mesh,
     fake_in_process_host,
     HostMesh,
 )
-from monarch._src.actor.proc_mesh import ProcMesh
+from monarch._src.actor.proc_mesh import _get_bootstrap_args, ProcMesh
 from monarch._src.actor.v1.host_mesh import (
+    _bootstrap_cmd,
     fake_in_process_host as fake_in_process_host_v1,
     HostMesh as HostMeshV1,
     this_host as this_host_v1,
@@ -466,7 +468,7 @@ async def no_more(self) -> None:


 @pytest.mark.parametrize("v1", [True, False])
-@pytest.mark.timeout(30)
+@pytest.mark.timeout(60)
 async def test_async_concurrency(v1: bool):
     """Test that async endpoints will be processed concurrently."""
     pm = spawn_procs_on_this_host(v1, {})
@@ -603,8 +605,9 @@ def _handle_undeliverable_message(
     return True


+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(60)
-async def test_actor_log_streaming() -> None:
+async def test_actor_log_streaming(v1: bool) -> None:
     # Save original file descriptors
     original_stdout_fd = os.dup(1)  # stdout
     original_stderr_fd = os.dup(2)  # stderr
@@ -631,7 +634,7 @@ async def test_actor_log_streaming() -> None:
     sys.stderr = stderr_file

     try:
-        pm = spawn_procs_on_this_host(v1=False, per_host={"gpus": 2})
+        pm = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
         am = pm.spawn("printer", Printer)

         # Disable streaming logs to client
@@ -671,7 +674,10 @@ async def test_actor_log_streaming() -> None:
         await am.print.call("has print streaming too")
         await am.log.call("has log streaming as level matched")

-        await pm.stop()
+        if not v1:
+            await pm.stop()
+        else:
+            await asyncio.sleep(1)

         # Flush all outputs
         stdout_file.flush()
@@ -752,8 +758,9 @@ async def test_actor_log_streaming() -> None:
         pass


+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(120)
-async def test_alloc_based_log_streaming() -> None:
+async def test_alloc_based_log_streaming(v1: bool) -> None:
     """Test both AllocHandle.stream_logs = False and True cases."""

     async def test_stream_logs_case(stream_logs: bool, test_name: str) -> None:
@@ -770,23 +777,45 @@ async def test_stream_logs_case(stream_logs: bool, test_name: str) -> None:

         try:
             # Create proc mesh with custom stream_logs setting
-            host_mesh = create_local_host_mesh()
-            alloc_handle = host_mesh._alloc(hosts=1, gpus=2)
+            if not v1:
+                host_mesh = create_local_host_mesh()
+                alloc_handle = host_mesh._alloc(hosts=1, gpus=2)
+
+                # Override the stream_logs setting
+                custom_alloc_handle = AllocHandle(
+                    alloc_handle._hy_alloc, alloc_handle._extent, stream_logs
+                )
+
+                pm = ProcMesh.from_alloc(custom_alloc_handle)
+            else:

-            # Override the stream_logs setting
-            custom_alloc_handle = AllocHandle(
-                alloc_handle._hy_alloc, alloc_handle._extent, stream_logs
-            )
+                class ProcessAllocatorStreamLogs(ProcessAllocator):
+                    def _stream_logs(self) -> bool:
+                        return stream_logs
+
+                alloc = ProcessAllocatorStreamLogs(*_get_bootstrap_args())
+
+                host_mesh = HostMeshV1.allocate_nonblocking(
+                    "host",
+                    Extent(["hosts"], [1]),
+                    alloc,
+                    bootstrap_cmd=_bootstrap_cmd(),
+                )
+
+                pm = host_mesh.spawn_procs(name="proc", per_host={"gpus": 2})

-            pm = ProcMesh.from_alloc(custom_alloc_handle)
             am = pm.spawn("printer", Printer)

             await pm.initialized

             for _ in range(5):
                 await am.print.call(f"{test_name} print streaming")

-            await pm.stop()
+            if not v1:
+                await pm.stop()
+            else:
+                # Wait for at least the aggregation window (3 seconds)
+                await asyncio.sleep(5)

             # Flush all outputs
             stdout_file.flush()
@@ -810,18 +839,18 @@ async def test_stream_logs_case(stream_logs: bool, test_name: str) -> None:
                 # When stream_logs=False, logs should not be streamed to client
                 assert not re.search(
                     rf"similar log lines.*{test_name} print streaming", stdout_content
-                ), f"stream_logs=True case: {stdout_content}"
+                ), f"stream_logs=False case: {stdout_content}"
                 assert re.search(
                     rf"{test_name} print streaming", stdout_content
-                ), f"stream_logs=True case: {stdout_content}"
+                ), f"stream_logs=False case: {stdout_content}"
             else:
                 # When stream_logs=True, logs should be streamed to client (no aggregation by default)
                 assert re.search(
                     rf"similar log lines.*{test_name} print streaming", stdout_content
-                ), f"stream_logs=False case: {stdout_content}"
+                ), f"stream_logs=True case: {stdout_content}"
                 assert not re.search(
                     rf"\[[0-9]\]{test_name} print streaming", stdout_content
-                ), f"stream_logs=False case: {stdout_content}"
+                ), f"stream_logs=True case: {stdout_content}"

         finally:
             # Ensure file descriptors are restored even if something goes wrong
@@ -836,8 +865,9 @@ async def test_stream_logs_case(stream_logs: bool, test_name: str) -> None:
     await test_stream_logs_case(True, "stream_logs_true")


+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(60)
-async def test_logging_option_defaults() -> None:
+async def test_logging_option_defaults(v1: bool) -> None:
     # Save original file descriptors
     original_stdout_fd = os.dup(1)  # stdout
     original_stderr_fd = os.dup(2)  # stderr
@@ -864,14 +894,18 @@ async def test_logging_option_defaults() -> None:
     sys.stderr = stderr_file

     try:
-        pm = spawn_procs_on_this_host(v1=False, per_host={"gpus": 2})
+        pm = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
         am = pm.spawn("printer", Printer)

         for _ in range(5):
             await am.print.call("print streaming")
             await am.log.call("log streaming")

-        await pm.stop()
+        if not v1:
+            await pm.stop()
+        else:
+            # Wait for > default aggregation window (3 seconds)
+            await asyncio.sleep(5)

         # Flush all outputs
         stdout_file.flush()
@@ -949,7 +983,8 @@ def __init__(self):

 # oss_skip: pytest keeps complaining about mocking get_ipython module
 @pytest.mark.oss_skip
-async def test_flush_called_only_once() -> None:
+@pytest.mark.parametrize("v1", [True, False])
+async def test_flush_called_only_once(v1: bool) -> None:
     """Test that flush is called only once when ending an ipython cell"""
     mock_ipython = MockIPython()
     with unittest.mock.patch(
@@ -961,8 +996,8 @@ async def test_flush_called_only_once() -> None:
         "monarch._src.actor.logging.flush_all_proc_mesh_logs"
     ) as mock_flush:
         # Create 2 proc meshes with a large aggregation window
-        pm1 = this_host().spawn_procs(per_host={"gpus": 2})
-        _ = this_host().spawn_procs(per_host={"gpus": 2})
+        pm1 = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
+        _ = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
         # flush not yet called unless post_run_cell
         assert mock_flush.call_count == 0
         assert mock_ipython.events.registers == 0
@@ -976,8 +1011,9 @@ async def test_flush_called_only_once() -> None:

 # oss_skip: pytest keeps complaining about mocking get_ipython module
 @pytest.mark.oss_skip
+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(180)
-async def test_flush_logs_ipython() -> None:
+async def test_flush_logs_ipython(v1: bool) -> None:
     """Test that logs are flushed when get_ipython is available and post_run_cell event is triggered."""
     # Save original file descriptors
     original_stdout_fd = os.dup(1)  # stdout
@@ -1003,8 +1039,8 @@ async def test_flush_logs_ipython() -> None:
     ), unittest.mock.patch("monarch._src.actor.logging.IN_IPYTHON", True):
         # Make sure we can register and unregister callbacks
         for _ in range(3):
-            pm1 = this_host().spawn_procs(per_host={"gpus": 2})
-            pm2 = this_host().spawn_procs(per_host={"gpus": 2})
+            pm1 = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
+            pm2 = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
             am1 = pm1.spawn("printer", Printer)
             am2 = pm2.spawn("printer", Printer)
@@ -1108,8 +1144,9 @@ async def test_flush_logs_fast_exit() -> None:
     ), process.stdout


+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(60)
-async def test_flush_on_disable_aggregation() -> None:
+async def test_flush_on_disable_aggregation(v1: bool) -> None:
     """Test that logs are flushed when disabling aggregation.

     This tests the corner case: "Make sure we flush whatever in the aggregators before disabling aggregation."
@@ -1130,7 +1167,7 @@ async def test_flush_on_disable_aggregation() -> None:
     sys.stdout = stdout_file

     try:
-        pm = this_host().spawn_procs(per_host={"gpus": 2})
+        pm = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
         am = pm.spawn("printer", Printer)

         # Set a long aggregation window to ensure logs aren't flushed immediately
@@ -1151,7 +1188,11 @@ async def test_flush_on_disable_aggregation() -> None:
         for _ in range(5):
             await am.print.call("single log line")

-        await pm.stop()
+        if not v1:
+            await pm.stop()
+        else:
+            # Wait for > default aggregation window (3 secs)
+            await asyncio.sleep(5)

         # Flush all outputs
         stdout_file.flush()
@@ -1197,14 +1238,15 @@ async def test_flush_on_disable_aggregation() -> None:
         pass


+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(120)
-async def test_multiple_ongoing_flushes_no_deadlock() -> None:
+async def test_multiple_ongoing_flushes_no_deadlock(v1: bool) -> None:
     """
     The goal is to make sure when a user sends multiple sync flushes, we are not deadlocked.
     Because now a flush call is purely sync, it is very easy to get into a deadlock.
     So we assert the last flush call will not get into such a state.
     """
-    pm = this_host().spawn_procs(per_host={"gpus": 4})
+    pm = spawn_procs_on_this_host(v1, per_host={"gpus": 4})
     am = pm.spawn("printer", Printer)

     # Generate some logs that will be aggregated but not flushed immediately
@@ -1227,8 +1269,9 @@ async def test_multiple_ongoing_flushes_no_deadlock() -> None:
     futures[-1].get()


+@pytest.mark.parametrize("v1", [True, False])
 @pytest.mark.timeout(60)
-async def test_adjust_aggregation_window() -> None:
+async def test_adjust_aggregation_window(v1: bool) -> None:
     """Test that the flush deadline is updated when the aggregation window is adjusted.

     This tests the corner case: "This can happen if the user has adjusted the aggregation window."
@@ -1249,7 +1292,7 @@ async def test_adjust_aggregation_window() -> None:
     sys.stdout = stdout_file

     try:
-        pm = this_host().spawn_procs(per_host={"gpus": 2})
+        pm = spawn_procs_on_this_host(v1, per_host={"gpus": 2})
         am = pm.spawn("printer", Printer)

         # Set a long aggregation window initially
@@ -1267,7 +1310,11 @@ async def test_adjust_aggregation_window() -> None:
         for _ in range(3):
             await am.print.call("second batch of logs")

-        await pm.stop()
+        if not v1:
+            await pm.stop()
+        else:
+            # Wait for > aggregation window (2 secs)
+            await asyncio.sleep(4)

         # Flush all outputs
         stdout_file.flush()
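A pattern repeated throughout the v1 branches above: instead of await pm.stop(), the tests sleep past the log aggregation window so aggregated lines reach the client before the assertions run (presumably because stopping a v1 proc mesh does not yet flush the forwarders the way the v0 stop path does). A hedged helper capturing that pattern; the name settle_logs and the exact margin are mine, not the repo's:

import asyncio

async def settle_logs(pm, v1: bool, window_secs: float = 3.0) -> None:
    """Test-teardown sketch: v0 meshes flush their logs on stop(); for v1,
    wait out the aggregation window instead (an assumption drawn from the
    sleep(4)/sleep(5) calls in this diff)."""
    if not v1:
        await pm.stop()
    else:
        await asyncio.sleep(window_secs + 2)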
