disallow logging option for local procs (meta-pytorch#825)

James Sun · facebook-github-bot · commit c6e1668526f0 · 2025-08-11T21:14:25.000-07:00
Summary:

There is no way to tail logs from local procs, so simply disallow the logging
option for them. I don't quite like the new interface introduced here; an RFC is needed.

Differential Revision: D80063700
diff --git a/hyperactor_mesh/src/logging.rs b/hyperactor_mesh/src/logging.rs
@@ -34,7 +34,6 @@ use hyperactor::Unbind;
 use hyperactor::channel;
 use hyperactor::channel::ChannelAddr;
 use hyperactor::channel::ChannelRx;
-use hyperactor::channel::ChannelTransport;
 use hyperactor::channel::ChannelTx;
 use hyperactor::channel::Rx;
 use hyperactor::channel::Tx;
@@ -543,6 +542,10 @@ impl<T: LogSender + Unpin + 'static, S: io::AsyncWrite + Send + Unpin + 'static>
                 // Since LogSender::send takes &self, we don't need to clone it
                 if let Err(e) = this.log_sender.send(output_target, data_to_send) {
                     tracing::error!("error sending log: {}", e);
+                    return Poll::Ready(Err(io::Error::other(format!(
+                        "error sending write message: {}",
+                        e
+                    ))));
                 }
                 // Return success with the full buffer size
                 Poll::Ready(Ok(buf.len()))
@@ -632,37 +635,15 @@ impl Actor for LogForwardActor {
     async fn new(logging_client_ref: Self::Params) -> Result<Self> {
         let log_channel: ChannelAddr = match std::env::var(BOOTSTRAP_LOG_CHANNEL) {
             Ok(channel) => channel.parse()?,
-            Err(err) => {
-                tracing::debug!(
-                    "log forwarder actor failed to read env var {}: {}",
-                    BOOTSTRAP_LOG_CHANNEL,
-                    err
-                );
-                // TODO: an empty channel to serve
-                ChannelAddr::any(ChannelTransport::Unix)
-            }
+            Err(err) => return Err(err.into()),
         };
         tracing::info!(
             "log forwarder {} serve at {}",
             std::process::id(),
             log_channel
         );
 
-        let rx = match channel::serve(log_channel.clone()).await {
-            Ok((_, rx)) => rx,
-            Err(err) => {
-                // This can happen if we are not spanwed on a separate process like local.
-                // For local mesh, log streaming anyway is not needed.
-                tracing::error!(
-                    "log forwarder actor failed to bootstrap on given channel {}: {}",
-                    log_channel,
-                    err
-                );
-                channel::serve(ChannelAddr::any(ChannelTransport::Unix))
-                    .await?
-                    .1
-            }
-        };
+        let (_, rx) = channel::serve(log_channel.clone()).await?;
         Ok(Self {
             rx,
             logging_client_ref,
@@ -761,15 +742,7 @@ impl Actor for LogFlushActor {
     async fn new(_: ()) -> Result<Self, anyhow::Error> {
         let log_channel: ChannelAddr = match std::env::var(BOOTSTRAP_LOG_CHANNEL) {
             Ok(channel) => channel.parse()?,
-            Err(err) => {
-                tracing::debug!(
-                    "log forwarder actor failed to read env var {}: {}",
-                    BOOTSTRAP_LOG_CHANNEL,
-                    err
-                );
-                // TODO: this should error out; it can only happen with local proc; we need to fix it.
-                ChannelAddr::any(ChannelTransport::Unix)
-            }
+            Err(err) => return Err(err.into()),
         };
         let tx = channel::dial::<LogMessage>(log_channel)?;
 
@@ -993,6 +966,7 @@ mod tests {
 
     use hyperactor::channel;
     use hyperactor::channel::ChannelAddr;
+    use hyperactor::channel::ChannelTransport;
     use hyperactor::channel::ChannelTx;
     use hyperactor::channel::Tx;
     use hyperactor::id;
diff --git a/python/monarch/_src/actor/allocator.py b/python/monarch/_src/actor/allocator.py
@@ -65,6 +65,12 @@ def allocate(self, spec: AllocSpec) -> "AllocHandle":
         """
         return AllocHandle(self.allocate_nonblocking(spec).spawn(), spec.extent)
 
+    def fork_processses(self) -> bool:
+        """
+        Return a boolean indicating whether the allocator forks processes.
+        """
+        return True
+
 
 @final
 class ProcessAllocator(ProcessAllocatorBase, AllocateMixin):
@@ -79,6 +85,9 @@ class LocalAllocator(LocalAllocatorBase, AllocateMixin):
     An allocator that allocates by spawning actors into the current process.
     """
 
+    def fork_processses(self) -> bool:
+        return False
+
 
 @final
 class SimAllocator(SimAllocatorBase, AllocateMixin):
diff --git a/python/monarch/_src/actor/proc_mesh.py b/python/monarch/_src/actor/proc_mesh.py
@@ -133,6 +133,7 @@ def __init__(
         self,
         hy_proc_mesh: "Shared[HyProcMesh]",
         shape: Shape,
+        _fork_processes: bool,
         _device_mesh: Optional["DeviceMesh"] = None,
     ) -> None:
         self._proc_mesh = hy_proc_mesh
@@ -146,6 +147,7 @@ def __init__(
         self._code_sync_client: Optional[CodeSyncMeshClient] = None
         self._logging_mesh_client: Optional[LoggingMeshClient] = None
         self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
+        self._fork_processes = _fork_processes
         self._stopped = False
 
     @property
@@ -163,41 +165,50 @@ async def task() -> Literal[True]:
 
         return Future(coro=task())
 
-    def _init_manager_actors(self, setup: Callable[[], None] | None = None) -> None:
+    def _init_manager_actors(
+        self, setup: Callable[[], None] | None = None, _fork_processes: bool = True
+    ) -> None:
         self._proc_mesh = PythonTask.from_coroutine(
-            self._init_manager_actors_coro(self._proc_mesh, setup)
+            self._init_manager_actors_coro(self._proc_mesh, setup, _fork_processes)
         ).spawn()
 
     async def _init_manager_actors_coro(
         self,
         proc_mesh_: "Shared[HyProcMesh]",
         setup: Callable[[], None] | None = None,
+        _fork_processes: bool = True,
     ) -> "HyProcMesh":
         proc_mesh: HyProcMesh = await proc_mesh_
         # WARNING: it is unsafe to await self._proc_mesh here
         # because self._proc_mesh is the result of this function itself!
 
-        self._logging_mesh_client = await LoggingMeshClient.spawn(proc_mesh=proc_mesh)
-        self._logging_mesh_client.set_mode(
-            stream_to_client=True,
-            aggregate_window_sec=3,
-            level=logging.INFO,
-        )
-        if HAS_IPYTHON and get_ipython() is not None:
-            # For ipython environment, a cell can end fast with threads running in background.
-            # Flush all the ongoing logs proactively to avoid missing logs.
-            assert self._logging_mesh_client is not None
-            logging_client: LoggingMeshClient = self._logging_mesh_client
-            ipython = get_ipython()
+        if _fork_processes:
+            # logging mesh only makes sense with forked (remote or local) processes
+            self._logging_mesh_client = await LoggingMeshClient.spawn(
+                proc_mesh=proc_mesh
+            )
+            self._logging_mesh_client.set_mode(
+                stream_to_client=True,
+                aggregate_window_sec=3,
+                level=logging.INFO,
+            )
+            if HAS_IPYTHON and get_ipython() is not None:
+                # For ipython environment, a cell can end fast with threads running in background.
+                # Flush all the ongoing logs proactively to avoid missing logs.
+                assert self._logging_mesh_client is not None
+                logging_client: LoggingMeshClient = self._logging_mesh_client
+                ipython = get_ipython()
 
-            # pyre-ignore[21]
-            from IPython.core.interactiveshell import ExecutionResult
+                # pyre-ignore[21]
+                from IPython.core.interactiveshell import ExecutionResult
 
-            # pyre-ignore[11]
-            def flush_logs(_: ExecutionResult) -> None:
-                return Future(coro=logging_client.flush(proc_mesh).spawn().task()).get()
+                # pyre-ignore[11]
+                def flush_logs(_: ExecutionResult) -> None:
+                    return Future(
+                        coro=logging_client.flush(proc_mesh).spawn().task()
+                    ).get()
 
-            ipython.events.register("post_run_cell", flush_logs)
+                ipython.events.register("post_run_cell", flush_logs)
 
         _rdma_manager = (
             # type: ignore[16]
@@ -239,7 +250,12 @@ def _new_with_shape(self, shape: Shape) -> "ProcMesh":
             if self._maybe_device_mesh is None
             else self._device_mesh._new_with_shape(shape)
         )
-        pm = ProcMesh(self._proc_mesh, shape, _device_mesh=device_mesh)
+        pm = ProcMesh(
+            self._proc_mesh,
+            shape,
+            _device_mesh=device_mesh,
+            _fork_processes=self._fork_processes,
+        )
         pm._slice = True
         return pm
 
@@ -284,6 +300,7 @@ def from_alloc(
         alloc: AllocHandle,
         setup: Callable[[], None] | None = None,
         _init_manager_actors: bool = True,
+        _fork_processes: bool = True,
     ) -> "ProcMesh":
         """
         Allocate a process mesh according to the provided alloc.
@@ -311,10 +328,10 @@ async def task() -> HyProcMesh:
             list(alloc._extent.keys()),
             Slice.new_row_major(list(alloc._extent.values())),
         )
-        pm = ProcMesh(PythonTask.from_coroutine(task()).spawn(), shape)
+        pm = ProcMesh(PythonTask.from_coroutine(task()).spawn(), shape, _fork_processes)
 
         if _init_manager_actors:
-            pm._init_manager_actors(setup)
+            pm._init_manager_actors(setup, _fork_processes)
         return pm
 
     def __repr__(self) -> str:
@@ -420,6 +437,11 @@ async def logging_option(
         Returns:
             None
         """
+        if not self._fork_processes:
+            raise RuntimeError(
+                "Logging option is only available for allocators that fork processes. Allocators like LocalAllocator are not supported."
+            )
+
         if level < 0 or level > 255:
             raise ValueError("Invalid logging level: {}".format(level))
         await self.initialized
@@ -510,7 +532,9 @@ def _proc_mesh_from_allocator(
     # in the order of the dimensions.
     spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     alloc = allocator.allocate(spec)
-    return ProcMesh.from_alloc(alloc, setup, _init_manager_actors)
+    return ProcMesh.from_alloc(
+        alloc, setup, _init_manager_actors, allocator.fork_processses()
+    )
 
 
 def proc_mesh(
diff --git a/python/tests/test_python_actors.py b/python/tests/test_python_actors.py
@@ -1117,3 +1117,12 @@ def s(t):
     b = PythonTask.spawn_blocking(lambda: s(0))
     r = PythonTask.select_one([a.task(), b.task()]).block_on()
     assert r == (0, 1)
+
+
+async def test_logging_option_on_local_procs() -> None:
+    proc_mesh = local_proc_mesh(gpus=1)
+    with pytest.raises(
+        RuntimeError,
+        match="Logging option is only available for allocators that fork processes",
+    ):
+        await proc_mesh.logging_option(stream_to_client=True)