Skip to content

Commit 61bb965

Browse files
committed
Fix formatting
Signed-off-by: Hemil Desai <[email protected]>
1 parent e4fecfd commit 61bb965

File tree

3 files changed

+40
-24
lines changed

3 files changed

+40
-24
lines changed

src/nemo_run/core/execution/dgxcloud.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import subprocess
55
from dataclasses import dataclass, field
6+
from enum import Enum
67
from pathlib import Path
78
from typing import Any, Optional, Type
89

@@ -18,7 +19,6 @@
1819

1920
logger = logging.getLogger(__name__)
2021

21-
from enum import Enum
2222

2323
class DGXCloudState(Enum):
2424
CREATING = "Creating"
@@ -92,7 +92,9 @@ def get_project_and_cluster_id(self, token: str) -> tuple[Optional[str], Optiona
9292
break
9393
return project_id, cluster_id
9494

95-
def create_distributed_job(self, token: str, project_id: str, cluster_id: str, name:str, cmd: list[str]):
95+
def create_distributed_job(
96+
self, token: str, project_id: str, cluster_id: str, name: str, cmd: list[str]
97+
):
9698
"""
9799
Creates a distributed PyTorch job using the provided project/cluster IDs.
98100
"""
@@ -136,7 +138,7 @@ def create_distributed_job(self, token: str, project_id: str, cluster_id: str, n
136138
return response
137139

138140
def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
139-
name = name.replace("_", "-") # to meet K8s requirements
141+
name = name.replace("_", "-") # to meet K8s requirements
140142
token = self.get_auth_token()
141143
if not token:
142144
raise RuntimeError("Failed to get auth token")
@@ -184,20 +186,24 @@ def cancel(self, job_id: str):
184186
if response.status_code >= 200 and response.status_code < 300:
185187
logger.info(
186188
"Successfully cancelled job %s on DGX with response code %d",
187-
job_id, response.status_code
189+
job_id,
190+
response.status_code,
188191
)
189192
else:
190193
logger.error(
191194
"Failed to cancel job %s, response code=%d, reason=%s",
192-
job_id, response.status_code, response.text
195+
job_id,
196+
response.status_code,
197+
response.text,
193198
)
194199

195200
@classmethod
196201
def logs(cls: Type["DGXCloudExecutor"], app_id: str, fallback_path: Optional[str]):
197-
logger.warning("Logs not available for DGXCloudExecutor based jobs. Please visit the cluster UI to view the logs.")
202+
logger.warning(
203+
"Logs not available for DGXCloudExecutor based jobs. Please visit the cluster UI to view the logs."
204+
)
198205

199-
def cleanup(self, handle: str):
200-
...
206+
def cleanup(self, handle: str): ...
201207

202208
def assign(
203209
self,
@@ -212,7 +218,13 @@ def assign(
212218
self.experiment_id = exp_id
213219
os.makedirs(self.job_dir, exist_ok=True)
214220
assert any(
215-
map(lambda x: os.path.commonpath([os.path.abspath(x["path"]), os.path.abspath(self.job_dir)]) == os.path.abspath(x["path"]), self.pvcs)
221+
map(
222+
lambda x: os.path.commonpath(
223+
[os.path.abspath(x["path"]), os.path.abspath(self.job_dir)]
224+
)
225+
== os.path.abspath(x["path"]),
226+
self.pvcs,
227+
)
216228
), f"Need to specify atleast one PVC containing {self.job_dir}.\nTo update job dir to a PVC path, you can set the NEMORUN_HOME env var."
217229

218230
def package(self, packager: Packager, job_name: str):

src/nemo_run/run/experiment.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,13 @@ class Experiment(ConfigurableMixin):
183183
nemo experiment logs {exp_id} 0
184184
nemo experiment cancel {exp_id} 0
185185
"""
186-
_PARALLEL_SUPPORTED_EXECUTORS = (SlurmExecutor, LocalExecutor, SkypilotExecutor, DockerExecutor, DGXCloudExecutor)
186+
_PARALLEL_SUPPORTED_EXECUTORS = (
187+
SlurmExecutor,
188+
LocalExecutor,
189+
SkypilotExecutor,
190+
DockerExecutor,
191+
DGXCloudExecutor,
192+
)
187193
_DETACH_SUPPORTED_EXECUTORS = (SlurmExecutor, SkypilotExecutor, DGXCloudExecutor)
188194
_DEPENDENCY_SUPPORTED_EXECUTORS = (SlurmExecutor,)
189195
_RUNNER_DEPENDENT_EXECUTORS = (LocalExecutor,)

src/nemo_run/run/torchx_backend/schedulers/dgxcloud.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,14 @@ class DGXRequest:
6060
Wrapper around the torchx AppDef and the DGX executor.
6161
This object is used to store job submission info for the scheduler.
6262
"""
63+
6364
app: AppDef
6465
executor: DGXCloudExecutor
6566
cmd: list[str]
6667
name: str
6768

6869

69-
class DGXCloudScheduler(SchedulerMixin, Scheduler[dict[str, str]]): # type: ignore
70+
class DGXCloudScheduler(SchedulerMixin, Scheduler[dict[str, str]]): # type: ignore
7071
def __init__(self, session_name: str) -> None:
7172
super().__init__("dgx", session_name)
7273

@@ -76,11 +77,11 @@ def _run_opts(self) -> runopts:
7677
"job_dir",
7778
type_=str,
7879
help="The directory to place the job code and outputs."
79-
" The directory must not exist and will be created.",
80+
" The directory must not exist and will be created.",
8081
)
8182
return opts
8283

83-
def _submit_dryrun( # type: ignore
84+
def _submit_dryrun( # type: ignore
8485
self,
8586
app: AppDef,
8687
cfg: Executor,
@@ -100,7 +101,7 @@ def _submit_dryrun( # type: ignore
100101
return AppDryRunInfo(
101102
DGXRequest(app=app, executor=executor, cmd=cmd, name=role.name),
102103
# Minimal function to show the config, if any
103-
lambda req: f"DGX job for app: {req.app.name}, cmd: {' '.join(cmd)}, executor: {executor}"
104+
lambda req: f"DGX job for app: {req.app.name}, cmd: {' '.join(cmd)}, executor: {executor}",
104105
)
105106

106107
def schedule(self, dryrun_info: AppDryRunInfo[DGXRequest]) -> str:
@@ -148,20 +149,15 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
148149
RoleStatus(
149150
role_name,
150151
replicas=[
151-
ReplicaStatus(
152-
id=0,
153-
role=role_name,
154-
state=AppState.SUBMITTED,
155-
hostname=""
156-
)
152+
ReplicaStatus(id=0, role=role_name, state=AppState.SUBMITTED, hostname="")
157153
],
158154
)
159155
]
160156

161157
if not job_info:
162158
return None
163159

164-
executor: DGXCloudExecutor = job_info.get("executor", None) # type: ignore
160+
executor: DGXCloudExecutor = job_info.get("executor", None) # type: ignore
165161
if not executor:
166162
return None
167163

@@ -175,7 +171,7 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
175171
roles_statuses=roles_statuses,
176172
state=app_state,
177173
msg="",
178-
ui_url=f"{executor.base_url}/workloads/distributed/{job_id}"
174+
ui_url=f"{executor.base_url}/workloads/distributed/{job_id}",
179175
)
180176

181177
def _cancel_existing(self, app_id: str) -> None:
@@ -185,7 +181,7 @@ def _cancel_existing(self, app_id: str) -> None:
185181
stored_data = _get_job_dirs()
186182
job_info = stored_data.get(app_id)
187183
_, _, job_id = app_id.split("___")
188-
executor: DGXCloudExecutor = job_info.get("executor", None) # type: ignore
184+
executor: DGXCloudExecutor = job_info.get("executor", None) # type: ignore
189185
if not executor:
190186
return None
191187
executor.delete(job_id)
@@ -219,7 +215,9 @@ def _save_job_dir(app_id: str, job_status: str, executor: DGXCloudExecutor) -> N
219215

220216
app = {
221217
"job_status": job_status,
222-
"executor": serializer.serialize(fdl_dc.convert_dataclasses_to_configs(executor, allow_post_init=True)),
218+
"executor": serializer.serialize(
219+
fdl_dc.convert_dataclasses_to_configs(executor, allow_post_init=True)
220+
),
223221
}
224222
original_apps[app_id] = app
225223

0 commit comments

Comments (0)