Skip to content

Commit 77c8ba3

Browse files
authored
Ensure job directory creation for various schedulers (#216)
* Ensure job directory creation for various schedulers
  Signed-off-by: Hemil Desai <[email protected]>
* fix
  Signed-off-by: Hemil Desai <[email protected]>
---------
Signed-off-by: Hemil Desai <[email protected]>
1 parent fa89ad7 commit 77c8ba3

File tree

5 files changed

+5
-0
lines changed

5 files changed

+5
-0
lines changed

nemo_run/core/execution/docker.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ def get_containers(self, client: "DockerClient") -> list["Container"]:
350350
def save(self) -> None:
351351
apps = {}
352352
if not os.path.isfile(DOCKER_JOB_DIRS):
353+
os.makedirs(os.path.dirname(DOCKER_JOB_DIRS), exist_ok=True)
353354
Path(DOCKER_JOB_DIRS).touch()
354355

355356
with open(DOCKER_JOB_DIRS, "r+") as f:

nemo_run/run/torchx_backend/schedulers/dgxcloud.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ def _save_job_dir(app_id: str, job_status: str, executor: DGXCloudExecutor) -> N
204204
original_apps = {}
205205
os.makedirs(os.path.dirname(DGX_JOB_DIRS), exist_ok=True)
206206
if not os.path.isfile(DGX_JOB_DIRS):
207+
os.makedirs(os.path.dirname(DGX_JOB_DIRS), exist_ok=True)
207208
Path(DGX_JOB_DIRS).touch()
208209

209210
serializer = ZlibJSONSerializer()

nemo_run/run/torchx_backend/schedulers/local.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def create_scheduler(
184184
def _save_job_dir(apps: dict[str, _LocalAppDef]) -> None:
185185
original_apps = {}
186186
if not os.path.isfile(LOCAL_JOB_DIRS):
187+
os.makedirs(os.path.dirname(LOCAL_JOB_DIRS), exist_ok=True)
187188
Path(LOCAL_JOB_DIRS).touch()
188189

189190
with open(LOCAL_JOB_DIRS, "r+") as f:

nemo_run/run/torchx_backend/schedulers/skypilot.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ def create_scheduler(session_name: str, **kwargs: Any) -> SkypilotScheduler:
227227
def _save_job_dir(app_id: str, job_status: str, log_dir: str) -> None:
228228
original_apps = {}
229229
if not os.path.isfile(SKYPILOT_JOB_DIRS):
230+
os.makedirs(os.path.dirname(SKYPILOT_JOB_DIRS), exist_ok=True)
230231
Path(SKYPILOT_JOB_DIRS).touch()
231232

232233
with open(SKYPILOT_JOB_DIRS, "r+") as f:

nemo_run/run/torchx_backend/schedulers/slurm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ def create_scheduler(session_name: str, **kwargs: Any) -> SlurmTunnelScheduler:
374374
def _save_job_dir(
375375
job_id: str, local_job_dir: str, tunnel: SSHTunnel | LocalTunnel, ls_term: str
376376
) -> None:
377+
os.makedirs(os.path.dirname(SLURM_JOB_DIRS), exist_ok=True)
377378
with open(SLURM_JOB_DIRS, "a+") as f:
378379
f.write(
379380
f"{job_id} = {ls_term},{local_job_dir},{tunnel.__class__.__name__},{json.dumps(asdict(tunnel))}\n"

0 commit comments

Comments
 (0)