Skip to content

Commit 43c7157

Browse files
authored
Fix tcpx_decorator (#616)
* Fix tcpx_decorator Without this change getting errors of missing volume when trying to create A3 clusters * Add unit tests for test_tcpx_decorator Also change code at the module level in main.py to follow idiomatic Python usage (https://docs.python.org/3/library/__main__.html#idiomatic-usage) that allows for unit testing * Fix linter * Fix formatting with pyink
1 parent 5195c31 commit 43c7157

File tree

5 files changed

+307
-10
lines changed

5 files changed

+307
-10
lines changed

src/xpk/core/workload_decorators/tcpx_decorator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def add_volumes(job_manifest: dict):
131131
})
132132
volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
133133
volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
134+
volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
134135
volumes.append(
135136
{'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
136137
)
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
import copy
18+
19+
import yaml
20+
21+
from xpk.core.workload_decorators import tcpx_decorator
22+
from xpk.utils.yaml import literal_string
23+
24+
# Minimal JobSet manifest for testing
25+
BASE_JOBSET_MANIFEST_STR = """
26+
apiVersion: jobset.x-k8s.io/v1alpha2
27+
kind: JobSet
28+
metadata:
29+
name: test-jobset
30+
spec:
31+
replicatedJobs:
32+
- name: slice-job
33+
template:
34+
spec:
35+
template:
36+
metadata:
37+
annotations:
38+
existing-annotation: "true"
39+
spec:
40+
containers:
41+
- name: main-gpu-container
42+
image: my-gpu-image
43+
resources:
44+
limits:
45+
nvidia.com/gpu: 8
46+
- name: sidecar-container
47+
image: my-sidecar-image
48+
"""
49+
50+
# Minimal kjob template for testing
51+
BASE_KJOB_TEMPLATE = {
52+
"spec": {
53+
"template": {
54+
"spec": {
55+
"containers": [
56+
{
57+
"name": "main-gpu-container",
58+
"image": "my-gpu-image",
59+
"resources": {"limits": {"nvidia.com/gpu": 8}},
60+
},
61+
{"name": "sidecar-container", "image": "my-sidecar-image"},
62+
]
63+
}
64+
}
65+
}
66+
}
67+
68+
# Minimal job manifest for testing
69+
BASE_JOB_MANIFEST = {
70+
"spec": {
71+
"template": {
72+
"metadata": {"annotations": {"existing-annotation": "true"}},
73+
"spec": {
74+
"containers": [
75+
{
76+
"name": "main-gpu-container",
77+
"image": "my-gpu-image",
78+
"resources": {"limits": {"nvidia.com/gpu": 8}},
79+
},
80+
{"name": "sidecar-container", "image": "my-sidecar-image"},
81+
]
82+
},
83+
}
84+
}
85+
}
86+
87+
88+
def test_get_interfaces_annotation():
89+
"""Tests get_interfaces_annotation."""
90+
annotation = tcpx_decorator.get_interfaces_annotation()
91+
assert "networking.gke.io/interfaces" in annotation
92+
assert isinstance(annotation["networking.gke.io/interfaces"], literal_string)
93+
expected_value = (
94+
"[\n"
95+
' {"interfaceName":"eth0","network":"default"},\n'
96+
' {"interfaceName":"eth1","network":"vpc1"},\n'
97+
' {"interfaceName":"eth2","network":"vpc2"},\n'
98+
' {"interfaceName":"eth3","network":"vpc3"},\n'
99+
' {"interfaceName":"eth4","network":"vpc4"}\n'
100+
"]"
101+
)
102+
assert str(annotation["networking.gke.io/interfaces"]) == expected_value
103+
104+
105+
def test_get_tcpx_deamon_annotation():
106+
"""Tests get_tcpx_deamon_annotation."""
107+
annotation = tcpx_decorator.get_tcpx_deamon_annotation()
108+
assert "devices.gke.io/container.tcpx-daemon" in annotation
109+
assert isinstance(
110+
annotation["devices.gke.io/container.tcpx-daemon"], literal_string
111+
)
112+
expected_value = (
113+
"- path: /dev/nvidia0\n"
114+
"- path: /dev/nvidia1\n"
115+
"- path: /dev/nvidia2\n"
116+
"- path: /dev/nvidia3\n"
117+
"- path: /dev/nvidia4\n"
118+
"- path: /dev/nvidia5\n"
119+
"- path: /dev/nvidia6\n"
120+
"- path: /dev/nvidia7\n"
121+
"- path: /dev/nvidiactl\n"
122+
"- path: /dev/nvidia-uvm\n"
123+
)
124+
assert (
125+
str(annotation["devices.gke.io/container.tcpx-daemon"]) == expected_value
126+
)
127+
128+
129+
def test_decorate_jobset():
130+
"""Tests decorate_jobset."""
131+
decorated_str = tcpx_decorator.decorate_jobset(BASE_JOBSET_MANIFEST_STR)
132+
manifest = yaml.safe_load(decorated_str)
133+
134+
pod_template_spec = manifest["spec"]["replicatedJobs"][0]["template"]["spec"][
135+
"template"
136+
]["spec"]
137+
pod_template_metadata = manifest["spec"]["replicatedJobs"][0]["template"][
138+
"spec"
139+
]["template"]["metadata"]
140+
141+
# Check annotations
142+
annotations = pod_template_metadata["annotations"]
143+
assert "existing-annotation" in annotations
144+
assert "devices.gke.io/container.tcpx-daemon" in annotations
145+
assert "networking.gke.io/default-interface" in annotations
146+
assert "networking.gke.io/interfaces" in annotations
147+
148+
# Check tolerations
149+
tolerations = pod_template_spec["tolerations"]
150+
assert {
151+
"key": "user-workload",
152+
"operator": "Equal",
153+
"value": "true",
154+
"effect": "NoSchedule",
155+
} in tolerations
156+
157+
# Check volumes
158+
volumes = pod_template_spec["volumes"]
159+
volume_names = {v["name"] for v in volumes}
160+
assert "libraries" in volume_names
161+
assert "sys" in volume_names
162+
assert "proc-sys" in volume_names
163+
assert "tcpx-socket" in volume_names
164+
assert "dshm" in volume_names
165+
166+
# Check init container
167+
init_containers = pod_template_spec["initContainers"]
168+
assert len(init_containers) == 1
169+
tcpx_daemon = init_containers[0]
170+
assert tcpx_daemon["name"] == "tcpx-daemon"
171+
assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
172+
173+
# Check GPU container update
174+
gpu_container = pod_template_spec["containers"][0]
175+
assert gpu_container["name"] == "main-gpu-container"
176+
177+
# Check env
178+
env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
179+
assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
180+
181+
# Check volume mounts
182+
volume_mounts = {
183+
vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
184+
}
185+
assert volume_mounts["tcpx-socket"] == "/tmp"
186+
assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
187+
assert volume_mounts["dshm"] == "/dev/shm"
188+
189+
# Check non-GPU container is not updated
190+
sidecar_container = pod_template_spec["containers"][1]
191+
assert "env" not in sidecar_container
192+
assert "volumeMounts" not in sidecar_container
193+
194+
195+
def test_decorate_job():
196+
"""Tests decorate_job."""
197+
job_manifest = copy.deepcopy(BASE_JOB_MANIFEST)
198+
199+
decorated_manifest = tcpx_decorator.decorate_job(job_manifest)
200+
pod_template_metadata = decorated_manifest["spec"]["template"]["metadata"]
201+
202+
# Check annotations
203+
annotations = pod_template_metadata["annotations"]
204+
assert "existing-annotation" in annotations
205+
assert "devices.gke.io/container.tcpx-daemon" in annotations
206+
assert "networking.gke.io/default-interface" in annotations
207+
assert "networking.gke.io/interfaces" in annotations
208+
209+
210+
def test_decorate_kjob_template():
211+
"""Tests decorate_kjob_template."""
212+
kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
213+
214+
decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
215+
216+
pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
217+
218+
# Check annotations are NOT added
219+
assert "annotations" not in decorated_manifest["spec"]["template"].get(
220+
"metadata", {}
221+
)
222+
223+
# Check tolerations
224+
tolerations = pod_template_spec["tolerations"]
225+
assert {
226+
"key": "user-workload",
227+
"operator": "Equal",
228+
"value": "true",
229+
"effect": "NoSchedule",
230+
} in tolerations
231+
232+
# Check volumes
233+
volumes = pod_template_spec["volumes"]
234+
volume_names = {v["name"] for v in volumes}
235+
assert "libraries" in volume_names
236+
assert "sys" in volume_names
237+
assert "proc-sys" in volume_names
238+
assert "tcpx-socket" in volume_names
239+
assert "dshm" in volume_names
240+
241+
# Check init container
242+
init_containers = pod_template_spec["initContainers"]
243+
assert len(init_containers) == 1
244+
tcpx_daemon = init_containers[0]
245+
assert tcpx_daemon["name"] == "tcpx-daemon"
246+
assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
247+
248+
# Check GPU container update
249+
gpu_container = pod_template_spec["containers"][0]
250+
assert gpu_container["name"] == "main-gpu-container"
251+
252+
# Check env
253+
env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
254+
assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
255+
256+
# Check volume mounts
257+
volume_mounts = {
258+
vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
259+
}
260+
assert volume_mounts["tcpx-socket"] == "/tmp"
261+
assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
262+
assert volume_mounts["dshm"] == "/dev/shm"
263+
264+
# Check non-GPU container is not updated
265+
sidecar_container = pod_template_spec["containers"][1]
266+
assert "env" not in sidecar_container
267+
assert "volumeMounts" not in sidecar_container

src/xpk/main.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,17 @@
5656
f' User currently is running {user_major_version}.{user_minor_version}'
5757
)
5858

59-
# Create top level parser for xpk command.
60-
parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
61-
set_parser(parser=parser)
62-
63-
xpk_print('Starting xpk', flush=True)
64-
validate_dependencies()
65-
main_args = parser.parse_args()
66-
main_args.enable_ray_cluster = False
67-
main_args.func(main_args)
68-
6959

7060
def main() -> None:
61+
# Create top level parser for xpk command.
62+
parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
63+
set_parser(parser=parser)
64+
65+
xpk_print('Starting xpk', flush=True)
66+
validate_dependencies()
67+
main_args = parser.parse_args()
68+
main_args.enable_ray_cluster = False
69+
main_args.func(main_args)
7170
xpk_print('XPK Done.', flush=True)
7271

7372

0 commit comments

Comments
 (0)