Skip to content

Commit dcb6265

Browse files
address comments
1 parent 9a2d764 commit dcb6265

File tree

2 files changed

+19
-15
lines changed

2 files changed

+19
-15
lines changed

tests/trainer/resources/mnist.ipynb

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -376,7 +376,7 @@
376376
" print(\"[notebook] Internet download completed successfully\")\n",
377377
" except Exception as e:\n",
378378
" print(f\"[notebook] Internet download failed: {e}\")\n",
379-
" print(\"[notebook] WARNING: Dataset may be incomplete!\")\n",
379+
" raise RuntimeError(\"Internet download failed; aborting test\")\n",
380380
"else:\n",
381381
" print(\"[notebook] Dataset files already present, skipping download\")\n"
382382
]
@@ -430,13 +430,8 @@
430430
"\n",
431431
"try:\n",
432432
" torch_runtime = client.get_runtime(\"torch-distributed\")\n",
433-
"except Exception:\n",
434-
" torch_runtime = next(\n",
435-
" (r for r in client.list_runtimes() if r.name == \"torch-distributed\"),\n",
436-
" None,\n",
437-
" )\n",
438-
" if torch_runtime is None:\n",
439-
" raise RuntimeError(\"Runtime 'torch-distributed' not found\")"
433+
"except Exception as e:\n",
434+
" raise RuntimeError(\"Runtime 'torch-distributed' not found or not accessible\") from e"
440435
]
441436
},
442437
{

tests/trainer/utils/utils_notebook.go

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ package trainer
1818

1919
import (
2020
"fmt"
21-
"os/exec"
2221
"strings"
2322
"time"
2423

@@ -52,21 +51,31 @@ func WaitForNotebookPodRunning(test Test, namespace string) (string, string) {
5251
// PollNotebookLogsForStatus polls the notebook container logs until a definitive NOTEBOOK_STATUS line appears or timeout.
5352
func PollNotebookLogsForStatus(test Test, namespace, podName, containerName string, timeout time.Duration) error {
5453
var finalErr error
54+
55+
// Tail last N lines similar to the previous kubectl --tail
56+
var tail int64 = 2000
57+
getLogs := PodLog(test, namespace, podName, corev1.PodLogOptions{
58+
Container: containerName,
59+
TailLines: &tail,
60+
})
61+
62+
// Track failure signal while polling
63+
sawFailure := false
5564
test.Eventually(func() bool {
56-
out, err := exec.Command("kubectl", "-n", namespace, "logs", podName, "-c", containerName, "--tail=2000").CombinedOutput()
57-
if err != nil {
58-
return false
59-
}
60-
logs := string(out)
65+
logs := getLogs(test)
6166
switch {
6267
case strings.Contains(logs, "NOTEBOOK_STATUS: SUCCESS"):
6368
return true
6469
case strings.Contains(logs, "NOTEBOOK_STATUS: FAILURE"):
65-
finalErr = fmt.Errorf("Notebook execution failed")
70+
sawFailure = true
6671
return true
6772
default:
6873
return false
6974
}
7075
}, timeout).Should(BeTrue(), "Notebook did not reach definitive state")
76+
77+
if sawFailure {
78+
finalErr = fmt.Errorf("Notebook execution failed")
79+
}
7180
return finalErr
7281
}

0 commit comments

Comments
 (0)