Skip to content

Commit 813afa0

Browse files
authored
Merge branch 'develop' into chzheng/docker_image_flag
2 parents 88f97a9 + 0434cf6 commit 813afa0

File tree

15 files changed

+422
-66
lines changed

15 files changed

+422
-66
lines changed

src/xpk/commands/cluster.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,6 @@ def cluster_adapt(args) -> None:
141141
if not tensorboard_config:
142142
xpk_exit(1)
143143

144-
# Provision node pools dynamically based on incoming workloads:
145-
# Currently autoprovisioning is not supported with Pathways.
146144
autoprovisioning_config = None
147145
if args.enable_autoprovisioning:
148146
xpk_print('Enabling Autoprovisioning')
@@ -294,7 +292,7 @@ def cluster_create(args) -> None:
294292
# Provision node pools dynamically based on incoming workloads:
295293
# Autoprovisioning is enabled whenever it is requested, including for Pathways clusters.
296294
autoprovisioning_config = None
297-
if not args.enable_pathways and args.enable_autoprovisioning:
295+
if args.enable_autoprovisioning:
298296
xpk_print('Enabling Autoprovisioning')
299297
autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
300298
args, system
@@ -819,7 +817,7 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
819817

820818

821819
def verify_coredns_readiness(
822-
args, timeout: int = 120, namespace: str = 'kube-system'
820+
args, timeout: int = 240, namespace: str = 'kube-system'
823821
):
824822
"""Verifies CoreDNS readiness using kubectl wait commands."""
825823
xpk_print('Now verifying CoreDNS readiness...')

src/xpk/commands/workload.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
from ..core.vertex import create_vertex_experiment
8585
from ..core.workload import (
8686
check_if_workload_exists,
87+
get_jobsets_list_gcp_link,
8788
get_workload_list,
8889
wait_for_job_completion,
8990
zone_to_region,
@@ -761,4 +762,8 @@ def workload_list(args) -> None:
761762
xpk_print(f'List Job request returned ERROR {return_code}')
762763
xpk_exit(return_code)
763764
xpk_print(f'Workload List Output:\n{return_value}')
765+
766+
workload_list_gcp_link = get_jobsets_list_gcp_link(project=args.project)
767+
xpk_print(f'See your workloads in Cloud Console: {workload_list_gcp_link}')
768+
764769
xpk_exit(0)

src/xpk/core/cluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
6262
0 if successful and 1 otherwise.
6363
"""
6464
command = (
65-
'kubectl apply --server-side -f'
66-
f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
65+
'kubectl apply --server-side --force-conflicts'
66+
f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
6767
)
6868
task = f'Install Jobset on {args.cluster}'
6969
return_code = run_command_with_updates_retry(command, task, args)

src/xpk/core/kueue.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
KUEUE_VERSION = 'v0.12.2'
4444
CLUSTER_QUEUE_NAME = 'cluster-queue'
4545
LOCAL_QUEUE_NAME = 'multislice-queue'
46-
WAIT_FOR_KUEUE_TIMEOUT = '5m'
46+
WAIT_FOR_KUEUE_TIMEOUT = '10m'
4747
MEMORY_SIZE_PER_VM = 1.2
4848
MIN_MEMORY_LIMIT_SIZE = 4096
4949

src/xpk/core/nap.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -269,9 +269,6 @@ def is_autoprovisioning_enabled(
269269
bool is true if autoprovisioning is enabled, false otherwise.
270270
int of 0 if successful and 1 otherwise.
271271
"""
272-
# Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
273-
if args.use_pathways:
274-
return False, 0
275272

276273
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
277274
cluster_config_map = get_cluster_configmap(args, resources_configmap_name)

src/xpk/core/resources.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,7 @@ def create_cluster_configmaps(
108108
device_type = system.device_type
109109
if system.accelerator_type == AcceleratorType['GPU']:
110110
resources_data = f'{device_type}: "{int(args.num_nodes)}"'
111-
elif (
112-
not args.enable_pathways
113-
and args.enable_autoprovisioning
114-
and autoprovisioning_config
115-
):
116-
# Currently autoprovisioning is not supported with Pathways.
117-
# Auto provisioning will have variable topologies for a gke accelerator type.
111+
elif args.enable_autoprovisioning and autoprovisioning_config:
118112
resources_data = (
119113
f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
120114
)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""
2+
Copyright 2025 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
from xpk.core.workload import get_jobsets_list_gcp_link
18+
19+
20+
def test_get_jobsets_list_gcp_link():
21+
result = get_jobsets_list_gcp_link(
22+
project='test-project',
23+
)
24+
25+
assert (
26+
result
27+
== 'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=test-project'
28+
)

src/xpk/core/workload.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
limitations under the License.
1515
"""
1616

17+
import re
1718
from ..utils.console import xpk_exit, xpk_print
1819
from .commands import run_command_for_value
1920
from .gcloud_context import zone_to_region
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
240241
xpk_print('Your workload did not complete successfully')
241242
return 125
242243
return 0
244+
245+
246+
GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
247+
"""Matches a valid name-prefix value (lowercase letters, digits and dashes only) that can be used in GCP filter chips."""
248+
249+
250+
def get_jobsets_list_gcp_link(project: str) -> str:
251+
"""Returns a link to the Cloud Console JobSets list for the given project."""
252+
253+
return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'

src/xpk/core/workload_decorators/tcpx_decorator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def add_volumes(job_manifest: dict):
131131
})
132132
volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
133133
volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
134+
volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
134135
volumes.append(
135136
{'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
136137
)
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""

0 commit comments

Comments
 (0)