Skip to content

Commit 8890e1a

Browse files
authored
Update Kueue and Jobset controller default limit value (#502)
* Update Kueue and Jobset controller default limit value * Update cluster.py * Split into get and update manifest * Remove dup lines * Organize code * Redesign the feature * Clean up code * Correct wrong description * Remove unnecessary section of yaml * Resolve lint issue * Reformat the change
1 parent 31dd601 commit 8890e1a

File tree

3 files changed

+282
-1
lines changed

3 files changed

+282
-1
lines changed

src/xpk/commands/cluster.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,14 @@
4242
get_gke_server_config,
4343
zone_to_region,
4444
)
45+
from ..core.jobset import update_jobset_resources_if_necessary
4546
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
4647
from ..core.kueue import (
4748
cluster_preheat_yml,
4849
install_kueue_crs,
4950
install_kueue_on_cluster,
5051
wait_for_kueue_available,
52+
update_kueue_resources_if_necessary,
5153
)
5254
from ..core.nap import enable_autoprovisioning_on_cluster
5355
from ..core.network import (
@@ -170,7 +172,6 @@ def cluster_adapt(args) -> None:
170172
install_kueue(args, system, autoprovisioning_config)
171173

172174
install_kjob(args)
173-
174175
if system.accelerator_type == AcceleratorType['GPU']:
175176
prepare_gpus(args, system)
176177

@@ -308,6 +309,9 @@ def cluster_create(args) -> None:
308309
set_jobset_on_cluster_code = set_jobset_on_cluster(args)
309310
if set_jobset_on_cluster_code != 0:
310311
xpk_exit(set_jobset_on_cluster_code)
312+
update_jobset_resources_code = update_jobset_resources_if_necessary(args)
313+
if update_jobset_resources_code != 0:
314+
xpk_exit(update_jobset_resources_code)
311315

312316
set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
313317
if set_pathways_job_on_cluster_code != 0:
@@ -957,6 +961,11 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
957961
if enable_kueue_credentials_code != 0:
958962
xpk_exit(enable_kueue_credentials_code)
959963

964+
xpk_print('Update Kueue Controller Manager resources')
965+
update_kueue_resources_code = update_kueue_resources_if_necessary(args)
966+
if update_kueue_resources_code != 0:
967+
xpk_exit(update_kueue_resources_code)
968+
960969

961970
def prepare_gpus(args, system: SystemCharacteristics):
962971
xpk_print('Installing NCCL Plugin for cluster')

src/xpk/core/jobset.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
import math
18+
19+
from ..utils.console import xpk_exit, xpk_print
20+
from ..utils.file import write_tmp_file
21+
from ..core.kueue import (
22+
MEMORY_SIZE_PER_VM,
23+
MIN_MEMORY_LIMIT_SIZE,
24+
)
25+
from .commands import (
26+
run_command_for_value,
27+
run_command_with_updates_retry,
28+
)
29+
30+
jobset_controller_manager_yml = """
31+
apiVersion: apps/v1
32+
kind: Deployment
33+
metadata:
34+
labels:
35+
app.kubernetes.io/component: manager
36+
app.kubernetes.io/created-by: jobset
37+
app.kubernetes.io/instance: controller-manager
38+
app.kubernetes.io/managed-by: kustomize
39+
app.kubernetes.io/name: deployment
40+
app.kubernetes.io/part-of: jobset
41+
control-plane: controller-manager
42+
name: jobset-controller-manager
43+
namespace: jobset-system
44+
spec:
45+
replicas: 1
46+
selector:
47+
matchLabels:
48+
control-plane: controller-manager
49+
template:
50+
metadata:
51+
annotations:
52+
kubectl.kubernetes.io/default-container: manager
53+
labels:
54+
control-plane: controller-manager
55+
spec:
56+
containers:
57+
- args:
58+
- --config=/controller_manager_config.yaml
59+
- --zap-log-level=2
60+
command:
61+
- /manager
62+
image: registry.k8s.io/jobset/jobset:v0.8.0
63+
livenessProbe:
64+
httpGet:
65+
path: /healthz
66+
port: 8081
67+
initialDelaySeconds: 15
68+
periodSeconds: 20
69+
name: manager
70+
ports:
71+
- containerPort: 9443
72+
name: webhook-server
73+
protocol: TCP
74+
readinessProbe:
75+
httpGet:
76+
path: /readyz
77+
port: 8081
78+
initialDelaySeconds: 5
79+
periodSeconds: 10
80+
resources:
81+
limits:
82+
memory: {memory_limit_size}
83+
requests:
84+
cpu: 500m
85+
memory: 128Mi
86+
securityContext:
87+
allowPrivilegeEscalation: false
88+
capabilities:
89+
drop:
90+
- ALL
91+
volumeMounts:
92+
- mountPath: /controller_manager_config.yaml
93+
name: manager-config
94+
subPath: controller_manager_config.yaml
95+
- mountPath: /tmp/k8s-webhook-server/serving-certs
96+
name: cert
97+
readOnly: true
98+
securityContext:
99+
runAsNonRoot: true
100+
serviceAccountName: jobset-controller-manager
101+
terminationGracePeriodSeconds: 10
102+
volumes:
103+
- configMap:
104+
name: jobset-manager-config
105+
name: manager-config
106+
- name: cert
107+
secret:
108+
defaultMode: 420
109+
secretName: jobset-webhook-server-cert
110+
"""
111+
112+
113+
def update_jobset_resources_if_necessary(args):
  """Raise the memory limit of the jobset controller manager if needed.

  Counts the nodes in the cluster, derives a memory limit from that count
  (MEMORY_SIZE_PER_VM MiB per node, but never less than
  MIN_MEMORY_LIMIT_SIZE MiB), renders the jobset controller manager
  Deployment manifest with that limit and applies it with kubectl.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful. 1 if the node count could not be parsed. Otherwise the
    non-zero return code of the kubectl apply command. If the node-count
    command itself fails, xpk_exit(1) is called instead of returning
    (presumably terminating the process -- confirm against xpk_exit).
  """
  # Get total number of nodes
  cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
  return_code, out = run_command_for_value(
      cmd_total_node_num, 'Count total nodes', args
  )
  if return_code != 0:
    xpk_exit(1)

  # The command output is free-form text; guard the conversion so unexpected
  # output is reported as an error code rather than an uncaught exception.
  try:
    total_nodes = int(out)
  except (TypeError, ValueError):
    xpk_print(f'Count total nodes returned unexpected output: {out!r}')
    return 1

  # 1.2MiB per VM or 4GiB (whichever is greater).
  memory_limit_mib = max(
      math.ceil(total_nodes * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE
  )
  new_memory_limit = f'{memory_limit_mib}Mi'
  yml_string = jobset_controller_manager_yml.format(
      memory_limit_size=new_memory_limit,
  )
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {tmp.file.name}'

  task = 'Updating jobset Controller Manager resources'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code

src/xpk/core/kueue.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from argparse import Namespace
1818

19+
import math
1920
import packaging
2021
from packaging.version import Version
2122

@@ -43,6 +44,8 @@
4344
CLUSTER_QUEUE_NAME = 'cluster-queue'
4445
LOCAL_QUEUE_NAME = 'multislice-queue'
4546
WAIT_FOR_KUEUE_TIMEOUT = '5m'
47+
MEMORY_SIZE_PER_VM = 1.2
48+
MIN_MEMORY_LIMIT_SIZE = 4096
4649

4750
packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
4851

@@ -166,6 +169,99 @@
166169
command: [ "sleep", "inf" ]
167170
"""
168171

172+
kueue_controller_manager_yml = """
173+
apiVersion: apps/v1
174+
kind: Deployment
175+
metadata:
176+
labels:
177+
app.kubernetes.io/component: controller
178+
app.kubernetes.io/name: kueue
179+
control-plane: controller-manager
180+
name: kueue-controller-manager
181+
namespace: kueue-system
182+
spec:
183+
replicas: 1
184+
selector:
185+
matchLabels:
186+
control-plane: controller-manager
187+
template:
188+
metadata:
189+
annotations:
190+
kubectl.kubernetes.io/default-container: manager
191+
labels:
192+
app.kubernetes.io/component: controller
193+
app.kubernetes.io/name: kueue
194+
control-plane: controller-manager
195+
spec:
196+
containers:
197+
- args:
198+
- --config=/controller_manager_config.yaml
199+
- --zap-log-level=2
200+
command:
201+
- /manager
202+
image: registry.k8s.io/kueue/kueue:v0.10.0
203+
imagePullPolicy: Always
204+
livenessProbe:
205+
httpGet:
206+
path: /healthz
207+
port: 8081
208+
initialDelaySeconds: 15
209+
periodSeconds: 20
210+
name: manager
211+
ports:
212+
- containerPort: 8082
213+
name: visibility
214+
protocol: TCP
215+
- containerPort: 9443
216+
name: webhook-server
217+
protocol: TCP
218+
readinessProbe:
219+
httpGet:
220+
path: /readyz
221+
port: 8081
222+
initialDelaySeconds: 5
223+
periodSeconds: 10
224+
resources:
225+
limits:
226+
cpu: 500m
227+
memory: {memory_limit_size}
228+
requests:
229+
cpu: 500m
230+
memory: 512Mi
231+
securityContext:
232+
allowPrivilegeEscalation: false
233+
volumeMounts:
234+
- mountPath: /tmp/k8s-webhook-server/serving-certs
235+
name: cert
236+
readOnly: true
237+
- mountPath: /controller_manager_config.yaml
238+
name: manager-config
239+
subPath: controller_manager_config.yaml
240+
- args:
241+
- --secure-listen-address=0.0.0.0:8443
242+
- --upstream=http://127.0.0.1:8080/
243+
- --logtostderr=true
244+
- --v=10
245+
image: registry.k8s.io/kubebuilder/kube-rbac-proxy:v0.16.0
246+
name: kube-rbac-proxy
247+
ports:
248+
- containerPort: 8443
249+
name: https
250+
protocol: TCP
251+
securityContext:
252+
runAsNonRoot: true
253+
serviceAccountName: kueue-controller-manager
254+
terminationGracePeriodSeconds: 10
255+
volumes:
256+
- name: cert
257+
secret:
258+
defaultMode: 420
259+
secretName: kueue-webhook-server-cert
260+
- configMap:
261+
name: kueue-manager-config
262+
name: manager-config
263+
"""
264+
169265

170266
def verify_kueuectl(args: Namespace) -> None:
171267
"""Verify if kueuectl is installed.
@@ -386,3 +482,36 @@ def get_kueue_covered_resources_config(
386482
total_chips=total_chips,
387483
)
388484
return config_string
485+
486+
487+
def update_kueue_resources_if_necessary(args):
  """Raise the memory limit of the kueue controller manager if needed.

  Counts the nodes in the cluster, derives a memory limit from that count
  (MEMORY_SIZE_PER_VM MiB per node, but never less than
  MIN_MEMORY_LIMIT_SIZE MiB), renders the kueue controller manager
  Deployment manifest with that limit and applies it with kubectl.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful. 1 if the node count could not be parsed. Otherwise the
    non-zero return code of the kubectl apply command. If the node-count
    command itself fails, xpk_exit(1) is called instead of returning
    (presumably terminating the process -- confirm against xpk_exit).
  """
  # Get total number of nodes
  cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
  return_code, out = run_command_for_value(
      cmd_total_node_num, 'Count total nodes', args
  )
  if return_code != 0:
    xpk_exit(1)

  # The command output is free-form text; guard the conversion so unexpected
  # output is reported as an error code rather than an uncaught exception.
  try:
    total_nodes = int(out)
  except (TypeError, ValueError):
    xpk_print(f'Count total nodes returned unexpected output: {out!r}')
    return 1

  # 1.2MiB per VM or 4GiB (whichever is greater).
  memory_limit_mib = max(
      math.ceil(total_nodes * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE
  )
  new_memory_limit = f'{memory_limit_mib}Mi'
  yml_string = kueue_controller_manager_yml.format(
      memory_limit_size=new_memory_limit,
  )
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {tmp.file.name}'

  task = 'Updating Kueue Controller Manager resources'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code

0 commit comments

Comments
 (0)