Skip to content

Commit 8d46493

Browse files
authored
[Bugfix] Adding a check for name length (#273)
* Shorten names so they don't break runs.
Signed-off-by: Zoey Zhang <[email protected]>
---------
Signed-off-by: Zoey Zhang <[email protected]>
1 parent 16895e0 commit 8d46493

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

nemo_run/core/execution/dgxcloud.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -225,6 +225,12 @@ def create_training_job(
225225
if self.nodes < 1:
226226
raise ValueError("Node count must be at least 1")
227227

228+
if len(name) >= 35:
229+
logger.warning(
230+
"Training name can only be max 35 characters. Shortening name to 35 characters..."
231+
)
232+
name = name[:34]
233+
228234
# Common payload elements
229235
common_payload = {
230236
"name": name,
@@ -265,6 +271,7 @@ def create_training_job(
265271
headers = self._default_headers(token=token)
266272
response = requests.post(url, json=payload, headers=headers)
267273

274+
logger.info(json.dumps(payload))
268275
logger.debug(
269276
"Created %s job; response code=%s, content=%s",
270277
"distributed" if self.nodes > 1 else "training",
@@ -276,6 +283,7 @@ def create_training_job(
276283

277284
def launch(self, name: str, cmd: list[str]) -> tuple[str, str]:
278285
name = name.replace("_", "-").replace(".", "-").lower() # to meet K8s requirements
286+
logger.info(f"workload name:{name}")
279287
token = self.get_auth_token()
280288
if not token:
281289
raise RuntimeError("Failed to get auth token")

0 commit comments

Comments
 (0)