Skip to content

Commit de990b0

Browse files
authored
Update cluster-toolkit to 1.47.0 (#416)
* update cluster toolkit to 1.47.0
* update missing ctk version in blueprint_generator.py
* fix: missing 'v'
* update network names for a3* workloads
* fix: unittests
* fix: linting errors
* use develop branch of cluster-toolkit
* fix linting
* fix unit tests
* fix integrations tests
* set rdma/gvnic start indices
* fix unit tests
* add rdma
1 parent 2c48026 commit de990b0

File tree

9 files changed

+57
-36
lines changed

9 files changed

+57
-36
lines changed

src/xpk/commands/kjob_common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from ..core.cluster import get_gpu_type_from_cluster
2020

2121

22-
def add_tcpxo_annotations(args, cmd: str) -> str:
22+
def add_tcpxo_annotations(args, cmd) -> str:
2323
tcpxo, interfaces, eth0 = get_a3mega_pod_template_annotations(args)
2424
cmd += f" --pod-template-annotation {tcpxo} \\\n"
2525
cmd += f" --pod-template-annotation {eth0} \\\n"

src/xpk/commands/workload.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
limitations under the License.
1515
"""
1616

17-
from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
1817
from ..core.cluster import (
1918
create_xpk_k8s_service_account,
2019
get_cluster_credentials,
@@ -81,6 +80,7 @@
8180
wait_for_job_completion,
8281
zone_to_region,
8382
)
83+
from ..core.network import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
8484
from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, storage_decorator
8585
from ..utils.console import get_user_input, xpk_exit, xpk_print
8686
from ..utils.file import write_tmp_file

src/xpk/core/blueprint/blueprint_generator.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,7 @@
3737
}
3838

3939
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
40-
cluster_toolkit_version = "v1.45.1"
41-
42-
43-
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
44-
return [f"{cluster_name}-gpunet-{i}-subnet" for i in range(8)]
45-
46-
47-
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
48-
return [f"{cluster_name}-sub-1"] + [
49-
f"{cluster_name}-rdma-sub-{i}" for i in range(8)
50-
]
40+
cluster_toolkit_version = "develop"
5141

5242

5343
class BlueprintGeneratorOutput:
@@ -157,6 +147,11 @@ def generate_a3_mega_blueprint(
157147
"total_min_nodes": system_node_pool_min_node_count,
158148
"total_max_nodes": 1000,
159149
},
150+
"k8s_network_names": {
151+
"gvnic_prefix": f"{cluster_name}-gpunet-",
152+
"gvnic_postfix": "-subnet",
153+
"gvnic_start_index": 0,
154+
},
160155
},
161156
outputs=["instructions"],
162157
)
@@ -490,6 +485,11 @@ def generate_a3_ultra_blueprint(
490485
" alias_ip_range=[]}],"
491486
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
492487
),
488+
"k8s_network_names": {
489+
"rdma_prefix": f"{cluster_name}-rdma-sub-",
490+
"rdma_start_index": 0,
491+
"rdma_postfix": "",
492+
},
493493
},
494494
outputs=["instructions"],
495495
)

src/xpk/core/docker_manager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@
3030
DockerRunCommandExitCode = 135
3131
dockerBuildErrorCode = 134
3232
ctk_dockerfile_path = "Dockerfile"
33-
ctk_build_ref = "v1.45.1"
33+
ctk_build_ref = "develop"
3434
ctk_docker_image = "xpk-ctk"
3535
ctk_container_name = "xpk-ctk-container"
3636
gcloud_cfg_mount_path = "/root/.config/gcloud"
3737
working_dir_mount_path = "/out"
38-
dockerfile_gh_path = f"https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{ctk_build_ref}/tools/cloud-build/images/cluster-toolkit-dockerfile/Dockerfile"
38+
dockerfile_gh_path = "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/tools/cloud-build/images/cluster-toolkit-dockerfile/Dockerfile"
3939
upload_dir_name = "uploads"
4040

4141

src/xpk/core/kjob.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,40 @@
1414
limitations under the License.
1515
"""
1616

17-
from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
18-
from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
1917
from argparse import Namespace
20-
import yaml
21-
from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
22-
from ..utils.console import xpk_print, xpk_exit
18+
from enum import Enum
2319

24-
from ..utils import templates
20+
import yaml
2521
from kubernetes import client as k8s_client
2622
from kubernetes.client import ApiClient
2723
from kubernetes.client.rest import ApiException
28-
from .cluster import setup_k8s_env, XPK_SA, DEFAULT_NAMESPACE
29-
from .storage import get_auto_mount_storages, get_auto_mount_gcsfuse_storages
30-
from .commands import run_command_for_value, run_kubectl_apply, run_command_with_updates
31-
from .config import XpkConfig, KJOB_SHELL_IMAGE, KJOB_SHELL_INTERACTIVE_COMMAND, KJOB_SHELL_WORKING_DIRECTORY, KJOB_BATCH_IMAGE, KJOB_BATCH_WORKING_DIRECTORY
32-
from .resources import get_cluster_system_characteristics, SystemCharacteristics, AcceleratorType
33-
from enum import Enum
34-
35-
from ..core.workload_decorators import tcpxo_decorator
3624

37-
from ..core.workload_decorators import rdma_decorator
25+
from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
26+
from ..core.workload_decorators import rdma_decorator, tcpxo_decorator
27+
from ..utils import templates
28+
from ..utils.console import xpk_exit, xpk_print
29+
from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
30+
from .commands import (
31+
run_command_for_value,
32+
run_command_with_updates,
33+
run_kubectl_apply,
34+
)
35+
from .config import (
36+
KJOB_BATCH_IMAGE,
37+
KJOB_BATCH_WORKING_DIRECTORY,
38+
KJOB_SHELL_IMAGE,
39+
KJOB_SHELL_INTERACTIVE_COMMAND,
40+
KJOB_SHELL_WORKING_DIRECTORY,
41+
XpkConfig,
42+
)
43+
from .network import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
44+
from .resources import (
45+
AcceleratorType,
46+
SystemCharacteristics,
47+
get_cluster_system_characteristics,
48+
)
49+
from .storage import get_auto_mount_gcsfuse_storages, get_auto_mount_storages
50+
from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
3851

3952
KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
4053
KJOB_API_GROUP_VERSION = "v1alpha1"

src/xpk/core/tests/data/a3_mega.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
!Blueprint
1717
blueprint_name: xpk-gke-a3-megagpu
1818
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
19-
toolkit_modules_version: v1.45.1
19+
toolkit_modules_version: develop
2020

2121
vars:
2222
project_id: "foo"
@@ -65,6 +65,10 @@ deployment_groups:
6565
system_node_pool_node_count:
6666
total_min_nodes: 5
6767
total_max_nodes: 1000
68+
k8s_network_names:
69+
gvnic_prefix: "bar-gpunet-"
70+
gvnic_postfix: "-subnet"
71+
gvnic_start_index: 0
6872
outputs: [instructions]
6973
- !DeploymentModule
7074
id: group_placement_0

src/xpk/core/tests/data/a3_ultra.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
!Blueprint
1515
blueprint_name: xpk-gke-a3-ultra
1616
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
17-
toolkit_modules_version: v1.45.1
17+
toolkit_modules_version: develop
1818

1919
vars:
2020

@@ -114,6 +114,10 @@ deployment_groups:
114114
total_min_nodes: 2
115115
total_max_nodes: 1000
116116
additional_networks: $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
117+
k8s_network_names:
118+
rdma_prefix: "gke-a3-ultra-rdma-sub-"
119+
rdma_start_index: 0
120+
rdma_postfix: ""
117121
outputs: [instructions]
118122

119123
- !DeploymentModule

src/xpk/core/workload_decorators/rdma_decorator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def decorate_kjob_template(job_manifest) -> str:
3333
return job_manifest
3434

3535

36-
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
36+
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
3737
"""
3838
Decorates a JobSet manifest with the necessary components for rdma-daemon.
3939
@@ -80,7 +80,7 @@ def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
8080
return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
8181

8282

83-
def add_annotations(job_manifest, sub_networks):
83+
def add_annotations(job_manifest: dict, sub_networks: list[str]):
8484
"""Adds or updates annotations in the Pod template."""
8585
annotations = job_manifest['spec']['template']['metadata']['annotations']
8686
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)

src/xpk/core/workload_decorators/tcpxo_decorator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
5757
return job_manifest
5858

5959

60-
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
60+
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
6161
"""
6262
Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
6363
@@ -105,7 +105,7 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
105105
)
106106

107107

108-
def add_annotations(job_manifest, sub_networks):
108+
def add_annotations(job_manifest: dict, sub_networks: list[str]):
109109
"""Adds or updates annotations in the Pod template."""
110110
annotations = job_manifest['spec']['template']['metadata']['annotations']
111111
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()

0 commit comments

Comments
 (0)