Skip to content

Commit 133f2a4

Browse files
authored
Fix max-nodes when creating flex queued nodepool of tpus (#541)
* Fix max-nodes when creating TPU DWS flex queued nodepools

Signed-off-by: Piotr Pawłowski <[email protected]>

---------

Signed-off-by: Piotr Pawłowski <[email protected]>
1 parent cd17009 commit 133f2a4

File tree

3 files changed

+7
-4
lines changed

3 files changed

+7
-4
lines changed

src/xpk/core/capacity.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ def verify_reservation_exists(args) -> int:
173173

174174

175175
def get_capacity_arguments_from_capacity_type(
176-
args, capacity_type: CapacityType
176+
args, capacity_type: CapacityType, max_nodes: int
177177
) -> tuple[str, int]:
178178
"""Determine the Nodepool creation capacity arguments needed.
179179
@@ -197,7 +197,7 @@ def get_capacity_arguments_from_capacity_type(
197197
capacity_args = (
198198
' --flex-start --enable-queued-provisioning --enable-autoscaling'
199199
' --location-policy=ANY --reservation-affinity=none'
200-
' --no-enable-autorepair --max-nodes=1'
200+
f' --no-enable-autorepair --max-nodes={max_nodes}'
201201
)
202202
case CapacityType.RESERVATION:
203203
capacity_args = (

src/xpk/core/kjob.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -347,7 +347,6 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
347347
if pod_image is None or len(pod_image) == 0:
348348
pod_image = PodTemplateDefaults.IMAGE.value
349349
working_directory = config.get(KJOB_SHELL_WORKING_DIRECTORY)
350-
xpk_print("working directory is: ", working_directory)
351350
if working_directory is None or len(working_directory) == 0:
352351
working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value
353352

src/xpk/core/nodepool.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,8 +77,12 @@ def run_gke_node_pool_create_command(
7777
if return_code > 0:
7878
xpk_print('Listing all reservations failed!')
7979
return_code = 1
80+
if system.accelerator_type == AcceleratorType['TPU']:
81+
max_nodes = system.vms_per_slice
82+
else:
83+
max_nodes = 1000
8084
capacity_args, return_code = get_capacity_arguments_from_capacity_type(
81-
args, capacity_type
85+
args, capacity_type, max_nodes
8286
)
8387
if return_code > 0:
8488
xpk_print('Parsing capacity arguments failed!')

0 commit comments

Comments
 (0)