
Commit fb6bdc5

James Sun authored and facebook-github-bot committed
link launch and sync conda/workspace locations (#742)
Summary:
Pull Request resolved: #742
X-link: #742

Make sure the conda/workspace locations during launch map with the locations when we sync.

Reviewed By: kiukchung

Differential Revision: D79516268

fbshipit-source-id: 80ed66f3dfc04b35c2fd66bc20ad910bdb800070
1 parent 5b5a94b commit fb6bdc5
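For orientation, a minimal sketch of the call shape this change enables. The scheduler name and workspace path are placeholders, and the `proc_mesh` import path is an assumption, not something this commit shows:

    from monarch.actor import proc_mesh  # import path assumed
    from monarch.tools.config import defaults

    async def main() -> None:
        pm = await proc_mesh(gpus=1)
        # Hand the launch config's workspace to sync, so launch and
        # sync agree on the conda/workspace locations.
        config = defaults.config("slurm", "/path/to/workspace")
        await pm.sync_workspace(
            workspace=config.workspace, conda=False, auto_reload=True
        )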

4 files changed: +79 -21 lines changed

python/monarch/_src/actor/proc_mesh.py

Lines changed: 18 additions & 14 deletions
@@ -8,7 +8,6 @@
 
 import asyncio
 import logging
-import os
 import sys
 import threading
 import warnings
@@ -70,6 +69,8 @@
 from monarch._src.actor.endpoint import endpoint
 from monarch._src.actor.future import DeprecatedNotAFuture, Future
 from monarch._src.actor.shape import MeshTrait
+from monarch.tools.config import Workspace
+from monarch.tools.utils import conda as conda_utils
 
 HAS_TENSOR_ENGINE = False
 try:
@@ -369,7 +370,10 @@ def rank_tensors(self) -> Dict[str, "Tensor"]:
         return self._device_mesh.ranks
 
     async def sync_workspace(
-        self, conda: bool = False, auto_reload: bool = False
+        self,
+        workspace: Workspace = None,
+        conda: bool = False,
+        auto_reload: bool = False,
     ) -> None:
         if self._code_sync_client is None:
             self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
@@ -382,21 +386,21 @@ async def sync_workspace(
         # The workspace shape (i.e. only perform one rsync per host).
         assert set(self._shape.labels).issubset({"gpus", "hosts"})
 
-        # TODO(agallagher): Is there a better way to infer/set the local
-        # workspace dir, rather than use PWD?
-        workspaces = [
-            WorkspaceConfig(
-                local=Path(os.getcwd()),
-                remote=RemoteWorkspace(
-                    location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
-                    shape=WorkspaceShape.shared("gpus"),
+        workspaces = []
+        if workspace is not None:
+            workspaces.append(
+                WorkspaceConfig(
+                    local=Path(workspace),
+                    remote=RemoteWorkspace(
+                        location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
+                        shape=WorkspaceShape.shared("gpus"),
+                    ),
+                    method=CodeSyncMethod.Rsync,
                 ),
-                method=CodeSyncMethod.Rsync,
-            ),
-        ]
+            )
 
         # If `conda` is set, also sync the currently activated conda env.
-        conda_prefix = os.environ.get("CONDA_PREFIX")
+        conda_prefix = conda_utils.active_env_dir()
         if conda and conda_prefix is not None:
             workspaces.append(
                 WorkspaceConfig(
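Net effect of the hunk above: `sync_workspace` no longer infers the local workspace from PWD. If `workspace` is None, nothing is rsynced; the active conda env is still synced when `conda=True` and an env is detected via `conda_utils.active_env_dir()`. A sketch of the two call patterns, assuming a proc mesh `pm` created elsewhere:

    async def demo(pm) -> None:
        # Rsync an explicit workspace directory to WORKSPACE_DIR on each host.
        await pm.sync_workspace(workspace="/path/to/workspace")

        # No workspace to rsync; only the active conda env (if any) is synced.
        await pm.sync_workspace(workspace=None, conda=True)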

python/monarch/tools/config/__init__.py

Lines changed: 14 additions & 4 deletions
@@ -6,9 +6,11 @@
 
 # pyre-strict
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, TYPE_CHECKING
 
-from torchx.specs import Role
+# Defer the import of Role to avoid requiring torchx at import time
+if TYPE_CHECKING:
+    from torchx.specs import Role
 
 
 NOT_SET: str = "__NOT_SET__"
@@ -20,10 +22,18 @@ class UnnamedAppDef:
     A TorchX AppDef without a name.
     """
 
-    roles: List[Role] = field(default_factory=list)
+    roles: List["Role"] = field(default_factory=list)
     metadata: Dict[str, str] = field(default_factory=dict)
 
 
+# TODO: provide a proper Workspace class to support
+# - multiple workspaces
+# - empty workspaces
+# - no workspace
+# - experimental directories
+Workspace = str | None
+
+
 @dataclass
 class Config:
     """
@@ -32,6 +42,6 @@ class Config:
 
     scheduler: str = NOT_SET
     scheduler_args: dict[str, Any] = field(default_factory=dict)
-    workspace: Optional[str] = None
+    workspace: Workspace = None
    dryrun: bool = False
    appdef: UnnamedAppDef = field(default_factory=UnnamedAppDef)
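The deferred import above uses the standard `typing.TYPE_CHECKING` idiom: the guarded block is evaluated only by type checkers, never at runtime, which is why the `roles` annotation becomes the string `"Role"`. A self-contained sketch of the same pattern (class name is illustrative):

    from dataclasses import dataclass, field
    from typing import List, TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen by pyre/mypy only; torchx is not required at runtime.
        from torchx.specs import Role

    @dataclass
    class Example:
        # String annotation, so it is not evaluated when the module loads.
        roles: List["Role"] = field(default_factory=list)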

python/monarch/tools/config/defaults.py

Lines changed: 3 additions & 3 deletions
@@ -8,10 +8,10 @@
 
 """Defines defaults for ``monarch.tools``"""
 
-from typing import Callable, Optional
+from typing import Callable
 
 from monarch.tools.components import hyperactor
-from monarch.tools.config import Config, UnnamedAppDef
+from monarch.tools.config import Config, UnnamedAppDef, Workspace
 
 from torchx import specs
 from torchx.schedulers import (
@@ -40,7 +40,7 @@ def scheduler_factories() -> dict[str, SchedulerFactory]:
     }
 
 
-def config(scheduler: str, workspace: Optional[str] = None) -> Config:
+def config(scheduler: str, workspace: Workspace = None) -> Config:
     """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``."""
     return Config(scheduler=scheduler, workspace=workspace)
 
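Since `Workspace` is an alias for `str | None`, the factory accepts the same values as before. A usage sketch (scheduler name and path are placeholders):

    from monarch.tools.config import defaults

    cfg = defaults.config("slurm", "/path/to/workspace")
    cfg_none = defaults.config("slurm")  # workspace defaults to None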

python/tests/test_python_actors.py

Lines changed: 44 additions & 0 deletions
@@ -37,6 +37,7 @@
     local_proc_mesh,
     proc_mesh,
 )
+from monarch.tools.config import defaults
 from typing_extensions import assert_type
 
 
@@ -950,6 +951,49 @@ async def test_same_actor_twice() -> None:
     ), f"Expected error message about duplicate actor name, got: {error_msg}"
 
 
+class LsActor(Actor):
+    def __init__(self, workspace: str):
+        self.workspace = workspace
+
+    @endpoint
+    async def ls(self) -> list[str]:
+        return os.listdir(self.workspace)
+
+
+async def test_sync_workspace() -> None:
+    pm = await proc_mesh(gpus=1)
+
+    # create two workspaces: one for local and one for remote
+    with tempfile.TemporaryDirectory() as workspace_src, tempfile.TemporaryDirectory() as workspace_dst, unittest.mock.patch.dict(
+        os.environ, {"WORKSPACE_DIR": workspace_dst}
+    ):
+        os.environ["WORKSPACE_DIR"] = workspace_dst
+        config = defaults.config("slurm", workspace_src)
+        await pm.sync_workspace(
+            workspace=config.workspace, conda=False, auto_reload=True
+        )
+
+        # no file in remote workspace initially
+        am = await pm.spawn("ls", LsActor, workspace_dst)
+        for item in list(am.ls.call().get()):
+            assert len(item[1]) == 0
+
+        # write a file to local workspace
+        file_path = os.path.join(workspace_src, "new_file")
+        with open(file_path, "w") as f:
+            f.write("hello world")
+            f.flush()
+
+        # force a sync and it should populate on the dst workspace
+        await pm.sync_workspace(config.workspace, conda=False, auto_reload=True)
+        for item in list(am.ls.call().get()):
+            assert len(item[1]) == 1
+            assert item[1][0] == "new_file"
+            file_path = os.path.join(workspace_dst, item[1][0])
+            with open(file_path, "r") as f:
+                assert f.readline() == "hello world"
+
+
 class TestActorMeshStop(unittest.IsolatedAsyncioTestCase):
     async def test_actor_mesh_stop(self) -> None:
         pm = proc_mesh(gpus=2)
