Skip to content

Commit 9512c3b

Browse files
authored
Making job names match Run:ai requirements and making errors more descriptive (#255)
* adding response reasons to errors and making name all lowercase before launching jobs Signed-off-by: Zoey Zhang <[email protected]> * standardizing to use reason Signed-off-by: Zoey Zhang <[email protected]> --------- Signed-off-by: Zoey Zhang <[email protected]>
1 parent 75bc3f5 commit 9512c3b

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

nemo_run/core/execution/dgxcloud.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
163163
resp = self.create_data_mover_workload(token, project_id, cluster_id)
164164
if resp.status_code not in [200, 202]:
165165
raise RuntimeError(
166-
f"Failed to create data mover workload, status_code={resp.status_code}"
166+
f"Failed to create data mover workload, status_code={resp.status_code}, reason={resp.text}"
167167
)
168168

169169
resp_json = resp.json()
@@ -240,7 +240,7 @@ def create_distributed_job(self, token: str, project_id: str, cluster_id: str, n
240240
return response
241241

242242
def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
243-
name = name.replace("_", "-").replace(".", "-") # to meet K8s requirements
243+
name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements
244244
token = self.get_auth_token()
245245
if not token:
246246
raise RuntimeError("Failed to get auth token")
@@ -265,7 +265,9 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
265265
logger.info("Creating distributed workload")
266266
resp = self.create_distributed_job(token, project_id, cluster_id, name)
267267
if resp.status_code not in [200, 202]:
268-
raise RuntimeError(f"Failed to create job, status_code={resp.status_code}")
268+
raise RuntimeError(
269+
f"Failed to create job, status_code={resp.status_code}, reason={resp.text}"
270+
)
269271

270272
r_json = resp.json()
271273
job_id = r_json["workloadId"]

0 commit comments

Comments
 (0)