Skip to content

Commit b4d2fba

Browse files
authored
Merge branch 'develop' into chzheng/docker_image_flag
2 parents 605e7f1 + fd613de commit b4d2fba

File tree

4 files changed

+129
-7
lines changed

4 files changed

+129
-7
lines changed

src/xpk/commands/cluster.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -710,6 +710,9 @@ def cluster_create_ray_cluster(args) -> None:
710710

711711
def install_jq(args):
712712
"""Installs 'jq' utility."""
713+
if shutil.which('jq'):
714+
xpk_print("Task: 'Install jq' skipped, jq already installed.")
715+
return
713716
command_jq_install = 'sudo apt install jq -y'
714717
xpk_print("Task: 'Install jq' in progress.")
715718
return_code = run_command_with_updates(command_jq_install, 'Install jq', args)

src/xpk/core/nap.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
management:
4343
autoRepair: true
4444
autoUpgrade: true
45+
scopes:
46+
- "https://www.googleapis.com/auth/devstorage.read_write"
4547
autoprovisioningLocations:
4648
{zones}
4749
{resource_limits}
@@ -183,11 +185,11 @@ def create_autoprovisioning_config(
183185
# is not controlled by NAP.
184186
cpu_limits = """
185187
minimum: 1
186-
maximum: 10000
188+
maximum: 1000000
187189
"""
188190
memory_limits = """
189191
minimum: 1
190-
maximum: 10000
192+
maximum: 10000000
191193
"""
192194

193195
# By default, the maximum chips is set to be the current number of resources used

src/xpk/core/nodepool.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
create_or_update_cluster_configmap,
3333
)
3434
from .system_characteristics import AcceleratorType
35+
from functools import reduce
36+
from operator import mul
3537

3638
CLOUD_PLATFORM_AUTH_SCOPE_URL = (
3739
'"https://www.googleapis.com/auth/cloud-platform"'
@@ -275,20 +277,24 @@ def run_gke_node_pool_create_command(
275277
f' --host-maintenance-interval={args.host_maintenance_interval}'
276278
f' {capacity_args}'
277279
' --enable-gvnic'
278-
f' {args.custom_nodepool_arguments}'
279280
)
280281
if system.accelerator_type == AcceleratorType['TPU']:
281282
command += f' --node-version={gke_node_pool_version}'
283+
topology_product = reduce(
284+
mul, (int(x) for x in system.topology.split('x')), 1
285+
)
282286
if capacity_type == CapacityType.FLEX_START:
283287
command += ' --num-nodes=0'
284-
else:
288+
elif topology_product > 1:
285289
command += f' --num-nodes={system.vms_per_slice}'
286-
command += ' --placement-type=COMPACT --max-pods-per-node 15'
287290
command += (
288291
f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
289292
)
290-
command += f' --tpu-topology={system.topology}'
291-
command += f' {args.custom_tpu_nodepool_arguments}'
293+
294+
if topology_product > 1:
295+
command += ' --placement-type=COMPACT --max-pods-per-node 15'
296+
command += f' --tpu-topology={system.topology}'
297+
command += f' {args.custom_tpu_nodepool_arguments}'
292298
elif system.accelerator_type == AcceleratorType['GPU']:
293299
subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
294300
if capacity_type == CapacityType.FLEX_START:
@@ -319,6 +325,8 @@ def run_gke_node_pool_create_command(
319325
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
320326
command += ' --workload-metadata=GKE_METADATA'
321327

328+
command += f' {args.custom_nodepool_arguments}'
329+
322330
task = f'NodepoolCreate-{node_pool_name}'
323331
create_commands.append(command)
324332
create_task_names.append(task)

src/xpk/core/system_characteristics.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,115 @@ def get_tpu_system_characteristics_map(
242242
'h100-mega-80gb-8',
243243
),
244244
# TPU system characteristics
245+
**get_tpu_system_characteristics_map(
246+
'tpu7x', 2, 'tpu7x', 'tpu7x-standard-1t', ['1x1x1']
247+
),
248+
**get_tpu_system_characteristics_map(
249+
'tpu7x',
250+
2,
251+
'tpu7x',
252+
'tpu7x-standard-4t',
253+
[
254+
'12x12x12',
255+
'12x12x16',
256+
'12x12x20',
257+
'12x12x24',
258+
'12x12x28',
259+
'12x12x36',
260+
'12x16x16',
261+
'12x16x20',
262+
'12x16x24',
263+
'12x16x28',
264+
'12x20x20',
265+
'12x20x24',
266+
'12x24x24',
267+
'16x16x16',
268+
'16x16x20',
269+
'16x16x24',
270+
'16x16x32',
271+
'16x20x28',
272+
'16x24x24',
273+
'2x2x1',
274+
'2x2x2',
275+
'2x2x4',
276+
'2x4x4',
277+
'4x12x116',
278+
'4x12x12',
279+
'4x12x124',
280+
'4x12x20',
281+
'4x12x28',
282+
'4x12x44',
283+
'4x12x52',
284+
'4x12x68',
285+
'4x12x76',
286+
'4x12x92',
287+
'4x20x20',
288+
'4x20x28',
289+
'4x20x44',
290+
'4x20x52',
291+
'4x20x68',
292+
'4x20x76',
293+
'4x28x28',
294+
'4x28x44',
295+
'4x28x52',
296+
'4x4x116',
297+
'4x4x12',
298+
'4x4x124',
299+
'4x4x148',
300+
'4x4x164',
301+
'4x4x172',
302+
'4x4x188',
303+
'4x4x20',
304+
'4x4x212',
305+
'4x4x236',
306+
'4x4x244',
307+
'4x4x28',
308+
'4x4x4',
309+
'4x4x44',
310+
'4x4x52',
311+
'4x4x68',
312+
'4x4x76',
313+
'4x4x8',
314+
'4x4x92',
315+
'4x8x116',
316+
'4x8x12',
317+
'4x8x124',
318+
'4x8x148',
319+
'4x8x164',
320+
'4x8x172',
321+
'4x8x188',
322+
'4x8x20',
323+
'4x8x28',
324+
'4x8x44',
325+
'4x8x52',
326+
'4x8x68',
327+
'4x8x76',
328+
'4x8x8',
329+
'4x8x92',
330+
'8x12x12',
331+
'8x12x16',
332+
'8x12x20',
333+
'8x12x28',
334+
'8x12x44',
335+
'8x12x52',
336+
'8x16x16',
337+
'8x16x20',
338+
'8x16x28',
339+
'8x16x44',
340+
'8x20x20',
341+
'8x20x28',
342+
'8x8x12',
343+
'8x8x16',
344+
'8x8x20',
345+
'8x8x28',
346+
'8x8x44',
347+
'8x8x52',
348+
'8x8x68',
349+
'8x8x76',
350+
'8x8x8',
351+
'8x8x92',
352+
],
353+
),
245354
**get_tpu_system_characteristics_map(
246355
'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1']
247356
),

0 commit comments

Comments
 (0)