Skip to content

Commit c35302a

Browse files
committed
Add additional debug logging to the DGXC data mover
The data mover for DGXC needs debug statements to make it easier to triage issues when workloads fail. Signed-Off-By: Robert Clark <[email protected]>
1 parent 2d3271e commit c35302a

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed

nemo_run/core/execution/dgxcloud.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
import logging
44
import os
55
import subprocess
6+
import tempfile
7+
import time
68
from dataclasses import dataclass, field
79
from enum import Enum
810
from pathlib import Path
9-
import time
10-
import tempfile
1111
from typing import Any, Optional, Type
1212

1313
import requests
1414
from invoke.context import Context
1515

16+
from nemo_run.config import get_nemorun_home
1617
from nemo_run.core.execution.base import Executor, ExecutorMacros
1718
from nemo_run.core.packaging.base import Packager
1819
from nemo_run.core.packaging.git import GitArchivePackager
19-
from nemo_run.config import get_nemorun_home
2020

2121
logger = logging.getLogger(__name__)
2222

@@ -170,6 +170,8 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
170170
workload_id = resp_json["workloadId"]
171171
status = DGXCloudState(resp_json["actualPhase"])
172172

173+
logger.info(f"Successfully created data movement workload {workload_id} on DGXCloud")
174+
173175
while status in [
174176
DGXCloudState.PENDING,
175177
DGXCloudState.CREATING,
@@ -178,9 +180,12 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
178180
]:
179181
time.sleep(sleep)
180182
status = self.status(workload_id)
183+
logger.debug(
184+
f"Polling data movement workload {workload_id}'s status. Current status is: {status}"
185+
)
181186

182187
if status is not DGXCloudState.COMPLETED:
183-
raise RuntimeError("Failed to move data to PVC")
188+
raise RuntimeError(f"Failed to move data to PVC. Workload status is {status}")
184189

185190
resp = self.delete_workload(token, workload_id)
186191
if resp.status_code >= 200 and resp.status_code < 300:

0 commit comments

Comments
 (0)