Skip to content

Commit 87338b5

Browse files
committed
Add additional debug to DGXC data mover
The data mover for DGXC needs debug statements to make it easier to triage issues when workloads fail. Signed-Off-By: Robert Clark <[email protected]>
1 parent 2d3271e commit 87338b5

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

nemo_run/core/execution/dgxcloud.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
import logging
44
import os
55
import subprocess
6+
import tempfile
7+
import time
68
from dataclasses import dataclass, field
79
from enum import Enum
810
from pathlib import Path
9-
import time
10-
import tempfile
1111
from typing import Any, Optional, Type
1212

1313
import requests
1414
from invoke.context import Context
1515

16+
from nemo_run.config import get_nemorun_home
1617
from nemo_run.core.execution.base import Executor, ExecutorMacros
1718
from nemo_run.core.packaging.base import Packager
1819
from nemo_run.core.packaging.git import GitArchivePackager
19-
from nemo_run.config import get_nemorun_home
2020

2121
logger = logging.getLogger(__name__)
2222

@@ -170,6 +170,10 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
170170
workload_id = resp_json["workloadId"]
171171
status = DGXCloudState(resp_json["actualPhase"])
172172

173+
logger.info(
174+
f"Successfully created data movement workload {workload_id} on DGXCloud"
175+
)
176+
173177
while status in [
174178
DGXCloudState.PENDING,
175179
DGXCloudState.CREATING,
@@ -178,9 +182,12 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
178182
]:
179183
time.sleep(sleep)
180184
status = self.status(workload_id)
185+
logger.debug(
186+
f"Polling data movement workload {workload_id}'s status. Current status is: {status}"
187+
)
181188

182189
if status is not DGXCloudState.COMPLETED:
183-
raise RuntimeError("Failed to move data to PVC")
190+
raise RuntimeError(f"Failed to move data to PVC. Workload status is {status}")
184191

185192
resp = self.delete_workload(token, workload_id)
186193
if resp.status_code >= 200 and resp.status_code < 300:

0 commit comments

Comments
 (0)