Skip to content

Commit 414f007

Browse files
authored
Add additional debug to DGXC data mover (#215)
The data mover for DGXC needs debug statements to make it easier to triage issues when workloads fail. Signed-off-by: Robert Clark <[email protected]>
1 parent 2d3271e commit 414f007

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed

nemo_run/core/execution/dgxcloud.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
import logging
44
import os
55
import subprocess
6+
import tempfile
7+
import time
68
from dataclasses import dataclass, field
79
from enum import Enum
810
from pathlib import Path
9-
import time
10-
import tempfile
1111
from typing import Any, Optional, Type
1212

1313
import requests
1414
from invoke.context import Context
1515

16+
from nemo_run.config import get_nemorun_home
1617
from nemo_run.core.execution.base import Executor, ExecutorMacros
1718
from nemo_run.core.packaging.base import Packager
1819
from nemo_run.core.packaging.git import GitArchivePackager
19-
from nemo_run.config import get_nemorun_home
2020

2121
logger = logging.getLogger(__name__)
2222

@@ -170,6 +170,8 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
170170
workload_id = resp_json["workloadId"]
171171
status = DGXCloudState(resp_json["actualPhase"])
172172

173+
logger.info(f"Successfully created data movement workload {workload_id} on DGXCloud")
174+
173175
while status in [
174176
DGXCloudState.PENDING,
175177
DGXCloudState.CREATING,
@@ -178,9 +180,12 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
178180
]:
179181
time.sleep(sleep)
180182
status = self.status(workload_id)
183+
logger.debug(
184+
f"Polling data movement workload {workload_id}'s status. Current status is: {status}"
185+
)
181186

182187
if status is not DGXCloudState.COMPLETED:
183-
raise RuntimeError("Failed to move data to PVC")
188+
raise RuntimeError(f"Failed to move data to PVC. Workload status is {status}")
184189

185190
resp = self.delete_workload(token, workload_id)
186191
if resp.status_code >= 200 and resp.status_code < 300:

0 commit comments

Comments
 (0)