|
505 | 505 | }, |
506 | 506 | "outputs": [], |
507 | 507 | "source": [ |
508 | | - "# Wait for the running status, then completion.\n", |
509 | | - "client.wait_for_job_status(name=job_name, status={\"Running\"})\n", |
510 | | - "client.wait_for_job_status(name=job_name, status={\"Complete\"})" |
| 508 | + "# Wait for the running status, then wait for completion or failure\n", |
| 509 | + "client.wait_for_job_status(name=job_name, status={\"Running\"}, timeout=300)\n", |
| 510 | + "client.wait_for_job_status(name=job_name, status={\"Complete\", \"Failed\"}, timeout=900)\n", |
| 511 | + "\n", |
| 512 | + "# Get job details and logs\n", |
| 513 | + "job = client.get_job(name=job_name)\n", |
| 514 | + "pod_logs = client.get_job_logs(name=job_name, follow=False)\n", |
| 515 | + "\n", |
| 516 | + "# Collect all log lines from the generator into a list\n", |
| 517 | + "logs = list(pod_logs)\n", |
| 518 | + "log_text = \"\\n\".join(str(line) for line in logs)\n", |
| 519 | + "\n", |
| 520 | + "# Report the status the job ended in before validating it\n", |
| 521 | + "print(f\"Training job final status: {job.status}\")\n", |
| 522 | + "\n", |
| 523 | + "# Check 1: Job status must not be \"Failed\"\n", |
| 524 | + "if job.status == \"Failed\":\n", |
| 525 | + "    print(f\"ERROR: Training job '{job_name}' has Failed status\")\n", |
| 526 | + "    print(\"Last 30 lines of logs:\")\n", |
| 527 | + "    for line in logs[-30:]:\n", |
| 528 | + "        print(line)\n", |
| 529 | + "    raise RuntimeError(f\"Training job '{job_name}' failed\")\n", |
| 530 | + "\n", |
| 531 | + "# Check 2: Look for the training completion message in logs\n", |
| 532 | + "# This is critical because the training script may catch exceptions and exit 0\n", |
| 533 | + "if \"Training is finished\" not in log_text:\n", |
| 534 | + "    print(\"ERROR: Training completion message not found in logs\")\n", |
| 535 | + "    print(\"Last 50 lines of logs:\")\n", |
| 536 | + "    for line in logs[-50:]:\n", |
| 537 | + "        print(line)\n", |
| 538 | + "    raise RuntimeError(\"Training did not complete successfully - missing completion message\")\n", |
| 539 | + "\n", |
| 540 | + "print(f\"✓ Training job '{job_name}' completed successfully\")" |
511 | 541 | ] |
512 | 542 | }, |
513 | 543 | { |
|
0 commit comments