Skip to content

Commit 7d3012e

Browse files
committed
merge with changes from main, updates to only manipulate kueue related jobs/pods
1 parent ff00705 commit 7d3012e

File tree

7 files changed

+198
-11
lines changed

7 files changed

+198
-11
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
python-batchtools
1+
# python-batchtools
22

33
## Overview
44

@@ -55,7 +55,7 @@ The br command is how to submit batchjobs. It submits code intended to run on GP
5555

5656
Here's how to use the br command:
5757

58-
First write a CUDA program and compile it :D
58+
First write a CUDA program and compile it :D <br>
5959
Then to submit your CUDA program to the GPU node:
6060

6161
``` sh
@@ -79,9 +79,9 @@ Run without waiting for logs (for longer runs, similar to a more traditional bat
7979
``` sh
8080
batchtools br --no-wait "./cuda_program"
8181
```
82-
***WARNING***
82+
***WARNING***<br>
8383
If you run br with the --no-wait flag, it will not be cleaned up for you. You must delete it on your own by running `batchtools bd <job-name>` or `oc delete job <job-name>`
84-
But don't worry, running with --no-wait will give you a reminder to delete your jobs!
84+
But don't worry, running with --no-wait will give you a reminder to delete your jobs!<br>
8585

8686
And if you need help or want to see more flags:
8787

batchtools/bd.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@ def run(args: argparse.Namespace):
4343
print("No jobs found.")
4444
return
4545

46+
gpu_jobs = [job for job in jobs]
47+
4648
# only want to delete kueue jobs so filter for kueue jobs
47-
kueue_gpu_jobs = [job for job in jobs if is_kueue_managed_job(job)]
49+
kueue_gpu_jobs = [job for job in gpu_jobs if is_kueue_managed_job(job)]
4850

4951
if not kueue_gpu_jobs:
5052
print("No Kueue-managed GPU jobs to delete.")

batchtools/bl.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .basecommand import Command
99
from .basecommand import SubParserFactory
1010
from .helpers import pretty_print
11+
from .helpers import is_kueue_managed_pod
1112

1213

1314
class LogsCommandArgs(argparse.Namespace):
@@ -63,10 +64,15 @@ def run(args: argparse.Namespace):
6364
print(pretty_print(pod_dict[name]))
6465

6566
else:
66-
# case where user provides no args, print logs for all pods
67+
printed_any = False
6768
for name, pod in pod_dict.items():
68-
print(f"\nLogs for {name}:\n{'-' * 40}")
69-
print(pretty_print(pod))
69+
if is_kueue_managed_pod(pod):
70+
printed_any = True
71+
print(f"\nLogs for {name}:\n{'-' * 40}")
72+
print(pretty_print(pod))
73+
74+
if not printed_any:
75+
print("No Kueue-managed pods found")
7076

7177
except oc.OpenShiftPythonException as e:
7278
sys.exit(f"Error occurred while retrieving logs: {e}")

batchtools/br.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def run(args: argparse.Namespace):
232232
print(
233233
f"User specified not to wait, or not to delete, so {job_name} must be deleted by user.\n"
234234
f"You can do this by running:\n"
235-
f" bd {job_name} OR\n"
235+
f" batchtools bd {job_name} OR\n"
236236
f" oc delete job {job_name}"
237237
)
238238

batchtools/helpers.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,20 @@ def is_kueue_managed_job(job_obj) -> bool:
4949
return False
5050

5151

52-
# FOR PRINTING TIMES
5352
def fmt(x):
    """Render *x* with three decimals and an ``s`` suffix, or ``"n/a"`` for None."""
    if x is None:
        return "n/a"
    return f"{x:.3f}s"
54+
55+
56+
def is_kueue_managed_pod(pod) -> bool:
    """Report whether *pod* is owned by a Job that Kueue manages.

    The pod's ``ownerReferences`` are scanned for the first entry whose kind
    is ``"Job"``. That Job is then fetched by name and passed to
    ``is_kueue_managed_job``. A pod without a Job owner yields ``False``.

    The check is best-effort: any exception raised along the way (for
    example a failed Job lookup) also yields ``False``.
    """
    try:
        metadata = pod.model.metadata
        owner_refs = getattr(metadata, "ownerReferences", []) or []

        # Stop at the first Job owner; later references are not inspected.
        job_ref = None
        for ref in owner_refs:
            if ref.kind == "Job":
                job_ref = ref
                break

        if job_ref:
            owning_job = oc.selector(f"job/{job_ref.name}").object()
            return is_kueue_managed_job(owning_job)
        return False

    except Exception:
        return False

tests/test-bt.sh

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Pause for one second, then print a separator line of 50 dashes.
print_line() {
    sleep 1
    local dashes
    # Build 50 spaces, then swap each one for a dash.
    printf -v dashes '%*s' 50 ''
    printf '%s\n' "${dashes// /-}"
}
8+
9+
echo testing bq
10+
batchtools bq
11+
print_line
12+
13+
echo 'testing bps (verbose)'
14+
batchtools --verbose bps
15+
print_line
16+
17+
echo 'testing bps (not verbose)'
18+
batchtools bps
19+
print_line
20+
21+
echo 'testing br without gpu nowait'
22+
# Plain ASCII quotes keep the whole command as a single argument to br;
# typographic quotes are not shell quoting, so the && would split the line.
batchtools br --gpu none --no-wait --name mtest 'sleep 10 && echo test'
23+
print_line
24+
25+
echo 'testing bd w/ one job, no args'
26+
batchtools bd
27+
print_line
28+
29+
echo 'testing br with gpu'
30+
cat <<EOF > hello_test.cu
31+
#include <stdio.h>
32+
33+
34+
__global__ void
35+
helloFromGPU(void)
36+
{
37+
printf("Hello from GPU\\n");
38+
}
39+
40+
41+
int main(void)
42+
{
43+
printf("Hello from CPU\\n");
44+
helloFromGPU <<< 1, 3 >>>();
45+
cudaDeviceSynchronize();
46+
cudaDeviceReset();
47+
}
48+
EOF
49+
50+
nvcc hello_test.cu -o hello
51+
52+
batchtools br './hello' | tee log.txt
53+
54+
print_line
55+
56+
echo 'testing batchtools br no command, should return error'
57+
batchtools br
58+
59+
print_line
60+
61+
echo 'testing bp, bl, bd no specified jobs'
62+
batchtools br --gpu none --no-wait 'sleep 10'
63+
64+
printf 'bp one job'
65+
printf '\n'
66+
67+
batchtools bp
68+
printf '\n'
69+
70+
printf 'bl one job'
71+
printf '\n'
72+
73+
batchtools bl
74+
printf '\n'
75+
76+
batchtools br --gpu none --no-wait 'sleep 10'
77+
78+
printf 'bp two jobs'
79+
printf '\n'
80+
81+
batchtools bp
82+
printf '\n'
83+
84+
printf 'bl two jobs'
85+
printf '\n'
86+
87+
batchtools bl
88+
89+
printf 'bd two jobs, no args'
90+
batchtools bd
91+
92+
print_line
93+
94+
echo 'testing br with context'
95+
96+
echo "Hello from CUDA file!" > input.txt
97+
98+
cat <<'EOF' > readfile.cu
99+
#include <stdio.h>
100+
#include <cuda_runtime.h>
101+
102+
__global__ void printFile(char *d) { printf("%s\n", d); }
103+
104+
int main() {
105+
FILE *f = fopen("input.txt", "r");
106+
char t[128]; fgets(t, 128, f); fclose(f);
107+
108+
char *d; cudaMalloc(&d, 128);
109+
cudaMemcpy(d, t, 128, cudaMemcpyHostToDevice);
110+
printFile<<<1,1>>>(d);
111+
cudaDeviceSynchronize();
112+
cudaFree(d);
113+
}
114+
EOF
115+
116+
nvcc readfile.cu -o readfile
117+
118+
batchtools br './readfile'
119+
print_line

tests/test_bl.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pytest
22
from unittest import mock
33
from contextlib import contextmanager
4+
import openshift_client as oc
45

56
import argparse
67
from typing import Any
@@ -41,7 +42,10 @@ def args() -> argparse.Namespace:
4142

4243
@contextmanager
4344
def patch_pods_selector(pods: list[Any]):
44-
with mock.patch("openshift_client.selector") as mock_selector:
45+
with (
46+
mock.patch("openshift_client.selector") as mock_selector,
47+
mock.patch("batchtools.bl.is_kueue_managed_pod", return_value=True),
48+
):
4549
mock_result = mock.Mock(name="result")
4650
mock_result.objects.return_value = pods
4751
mock_selector.return_value = mock_result
@@ -75,3 +79,45 @@ def test_get_logs_selected(args: argparse.Namespace, pods: list[Any], capsys):
7579
captured = capsys.readouterr()
7680
assert "Logs for pod1" in captured.out
7781
assert "Logs for pod2" not in captured.out
82+
83+
84+
def test_no_kueue_managed_pods(args: argparse.Namespace, pods: list[Any], capsys):
    """With no pod names given and no Kueue-managed pods, a notice is printed."""
    args.pod_names = []
    # Entered after patch_pods_selector, so this patch's False is what bl sees.
    not_managed = mock.patch(
        "batchtools.bl.is_kueue_managed_pod", return_value=False
    )
    with patch_pods_selector(pods), not_managed:
        LogsCommand.run(args)
        output = capsys.readouterr().out

    assert "No Kueue-managed pods found" in output
93+
94+
95+
def test_invalid_pod_name(args: argparse.Namespace, pods: list[Any], capsys):
    """An unknown pod name is reported while logs for known pods still print."""
    args.pod_names = ["pod1", "does-not-exist"]
    with patch_pods_selector(pods):
        LogsCommand.run(args)
        output = capsys.readouterr().out

    # The known pod's logs are still printed.
    assert "Logs for pod1" in output
    # The unknown name produces the not-a-valid-pod message.
    expected_notice = "does-not-exist is not a valid pod. Logs cannot be retrieved."
    assert expected_notice in output
108+
109+
110+
def test_selector_exception_exits_cleanly(args: argparse.Namespace):
    """A selector failure becomes a SystemExit that carries the error text."""
    # Same exception type that bl.py catches around its selector call.
    failure = oc.OpenShiftPythonException("test exception", mock.Mock())

    with mock.patch("openshift_client.selector", side_effect=failure):
        with pytest.raises(SystemExit) as excinfo:
            LogsCommand.run(args)

    exit_text = str(excinfo.value)
    assert "Error occurred while retrieving logs:" in exit_text
    assert "test exception" in exit_text

0 commit comments

Comments
 (0)