Skip to content

Commit 34565ae

Browse files
committed
Merge remote-tracking branch 'upstream/main' into rhoai-3.2
2 parents 7f0ff56 + 9598c5c commit 34565ae

File tree

1 file changed

+33
-3
lines changed

1 file changed

+33
-3
lines changed

tests/trainer/resources/mnist.ipynb

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -505,9 +505,39 @@
505505
},
506506
"outputs": [],
507507
"source": [
508-
"# Wait for the running status, then completion.\n",
509-
"client.wait_for_job_status(name=job_name, status={\"Running\"})\n",
510-
"client.wait_for_job_status(name=job_name, status={\"Complete\"})"
508+
"# Wait for the running status, then wait for completion or failure\n",
509+
"client.wait_for_job_status(name=job_name, status={\"Running\"}, timeout=300)\n",
510+
"client.wait_for_job_status(name=job_name, status={\"Complete\", \"Failed\"}, timeout=900)\n",
511+
"\n",
512+
"# Get job details and logs\n",
513+
"job = client.get_job(name=job_name)\n",
514+
"pod_logs = client.get_job_logs(name=job_name, follow=False)\n",
515+
"\n",
516+
"# Collect all log lines from the generator into a list\n",
517+
"logs = list(pod_logs)\n",
518+
"log_text = \"\\n\".join(str(line) for line in logs)\n",
519+
"\n",
520+
"\n",
521+
"print(f\"Training job final status: {job.status}\")\n",
522+
"\n",
523+
"# Check 1: Job status must not be \"Failed\"\n",
524+
"if job.status == \"Failed\":\n",
525+
" print(f\"ERROR: Training job '{job_name}' has Failed status\")\n",
526+
" print(\"Last 30 lines of logs:\")\n",
527+
" for line in logs[-30:]:\n",
528+
" print(line)\n",
529+
" raise RuntimeError(f\"Training job '{job_name}' failed\")\n",
530+
"\n",
531+
"# Check 2: Look for the training completion message in logs\n",
532+
"# This is critical because the training script may catch exceptions and still exit with status code 0\n",
533+
"if \"Training is finished\" not in log_text:\n",
534+
" print(f\"ERROR: Training completion message not found in logs\")\n",
535+
" print(\"Last 50 lines of logs:\")\n",
536+
" for line in logs[-50:]:\n",
537+
" print(line)\n",
538+
" raise RuntimeError(f\"Training did not complete successfully - missing completion message\")\n",
539+
"\n",
540+
"print(f\"✓ Training job '{job_name}' completed successfully\")"
511541
]
512542
},
513543
{

0 commit comments

Comments
 (0)