Refactor data mover: replace the Deployment with a batch LeptonJob that is cleaned up automatically, and sleep after every run

prekshivyas · prekshivyas · commit c855d6392a04 · 2025-06-08T12:01:47.000-07:00
Signed-off-by: prekshivyas <prekhsivyas@gmail.com>
diff --git a/nemo_run/core/execution/lepton.py b/nemo_run/core/execution/lepton.py
@@ -13,15 +13,12 @@
 from invoke.context import Context
 from leptonai.api.v2.client import APIClient
 from leptonai.api.v1.types.affinity import LeptonResourceAffinity
-from leptonai.api.v1.types.common import LeptonVisibility, Metadata
+from leptonai.api.v1.types.common import Metadata, LeptonVisibility
 from leptonai.api.v1.types.dedicated_node_group import DedicatedNodeGroup
 from leptonai.api.v1.types.deployment import (
     EnvVar,
     LeptonContainer,
-    LeptonDeployment,
-    LeptonDeploymentUserSpec,
     Mount,
-    ResourceRequirement,
 )
 from leptonai.api.v1.types.job import LeptonJob, LeptonJobState, LeptonJobUserSpec
 from leptonai.api.v1.types.replica import Replica
@@ -85,7 +82,7 @@ def copy_directory_data_command(self, local_dir_path: str, dest_path: str) -> Li
             full_command = ["sh", "-c", cmd]
             return full_command
 
-    def move_data(self, sleep: float = 10) -> None:
+    def move_data(self, sleep: float = 10, timeout: int = 600, poll_interval: int = 5) -> None:
         """
         Moves job directory into remote storage and deletes the workload after completion.
         """
@@ -94,34 +91,62 @@ def move_data(self, sleep: float = 10) -> None:
         node_group_id = self._node_group_id(client)
         valid_node_ids = self._valid_node_ids(node_group_id, client)
 
-        spec = LeptonDeploymentUserSpec(
-            container=LeptonContainer(
-                image="busybox:1.37.0",  # Use a very low resource container
-                command=cmd,
-            ),
-            mounts=[Mount(**mount) for mount in self.mounts],
-        )
-        spec.resource_requirement = ResourceRequirement(
+        job_spec = LeptonJobUserSpec(
             resource_shape="cpu.small",
             affinity=LeptonResourceAffinity(
                 allowed_dedicated_node_groups=[node_group_id.metadata.id_],
                 allowed_nodes_in_node_group=valid_node_ids,
             ),
-            min_replicas=1,
-            max_replicas=1,
+            container=LeptonContainer(
+                image="busybox:1.37.0",
+                command=cmd,
+            ),
+            completions=1,
+            parallelism=1,
+            mounts=[Mount(**mount) for mount in self.mounts],
+            ttl_seconds_after_finished=600,
         )
+
         custom_name = f"data-mover-{int(datetime.now().timestamp())}"
 
-        deployment = LeptonDeployment(
+        job = LeptonJob(
             metadata=Metadata(
                 id=custom_name,
                 name=custom_name,
                 visibility=LeptonVisibility("private"),
             ),
-            spec=spec,
+            spec=job_spec,
         )
 
-        client.deployment.create(deployment)
+        response = client.job.create(job)
+        job_id = response.metadata.id_
+
+        start_time = time.time()
+        count = 0
+
+        while True:
+            if time.time() - start_time > timeout:
+                raise TimeoutError(f"Job {job_id} did not complete within {timeout} seconds.")
+            current_job = client.job.get(job_id)
+            current_job_status = current_job.status.state
+            if count > 0 and current_job_status in [LeptonJobState.Completed, LeptonJobState.Failed, LeptonJobState.Unknown]:
+                break
+            count+=1
+            time.sleep(poll_interval)
+
+        if current_job_status != LeptonJobState.Completed:
+            raise RuntimeError(f"Job {job_id} failed with status: {current_job_status}")
+
+        time.sleep(sleep)
+
+        delete_success = client.job.delete(job_id)
+       
+        if delete_success:
+            logging.info(f"Successfully deleted job {job_id}")
+        else:
+            logging.error(f"Failed to delete job {job_id}")
+
+
 
     def _node_group_id(self, client: APIClient) -> DedicatedNodeGroup:
         """
@@ -231,7 +256,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
             f.write(launch_script)
 
         logger.info("Copying experiment directory to remote filesystem")
-        self.move_data()
+        self.move_data(sleep=60)
 
         logger.info("Creating distributed workload")
         job = self.create_lepton_job(name)