Skip to content

Commit 34d0428

Browse files
authored
Validate kueue subslice constraints in workload create (#731)
feat: validate kueue subslice constraints in workload create
1 parent 02606b4 commit 34d0428

File tree

2 files changed

+128
-4
lines changed

2 files changed

+128
-4
lines changed

src/xpk/commands/workload.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
get_cluster_credentials,
2727
setup_k8s_env,
2828
)
29-
from ..core.commands import run_command_with_updates, run_commands
29+
from ..core.commands import run_command_with_updates, run_commands, run_command_for_value
30+
from ..core.kueue_manager import KueueManager, SUB_SLICE_TOPOLOGY_NAME
3031
from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
3132
from ..core.docker_container import (
3233
get_main_container_docker_image,
@@ -95,6 +96,7 @@
9596
tcpxo_decorator,
9697
)
9798
from ..utils.console import get_user_input, xpk_exit, xpk_print
99+
from packaging.version import Version
98100
from ..utils.file import write_tmp_file
99101
from ..utils.execution_context import is_dry_run
100102
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
@@ -283,6 +285,7 @@
283285
"""
284286

285287
SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
288+
SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
286289

287290

288291
def workload_create_pathways(args) -> None:
@@ -340,6 +343,7 @@ def workload_create(args) -> None:
340343
xpk_exit(return_code)
341344

342345
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
346+
_validate_sub_slicing_availability()
343347
_validate_sub_slicing_topology(system, args.sub_slicing_topology)
344348

345349
if not check_if_workload_can_schedule(args, system):
@@ -678,6 +682,43 @@ def workload_create(args) -> None:
678682
xpk_exit(0)
679683

680684

685+
def _validate_sub_slicing_availability():
686+
return_code, value = run_command_for_value(
687+
command='kubectl get topology', task='Get defined topologies'
688+
)
689+
690+
if return_code != 0:
691+
xpk_print(
692+
'Error: Unable to validate sub-slicing support on a given cluster.'
693+
)
694+
xpk_exit(1)
695+
696+
if SUB_SLICE_TOPOLOGY_NAME not in value:
697+
xpk_print(
698+
'Error: Cluster has not been not set up for Sub-slicing. Please enable'
699+
' --sub-slicing in "cluster create" command first.'
700+
)
701+
xpk_exit(1)
702+
703+
kueue_manager = KueueManager()
704+
return_code, current_version = kueue_manager.get_installed_kueue_version()
705+
if return_code != 0:
706+
xpk_print(
707+
'Error: Unable to validate sub-slicing support on a given cluster.'
708+
)
709+
xpk_exit(1)
710+
711+
if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
712+
xpk_print(
713+
"Error: Current Kueue version ({current_version}) doesn't support"
714+
' Sub-slicing. The minimal required version is'
715+
' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
716+
' manually, or run "cluster create --sub-slicing" on the existing'
717+
' cluster.'
718+
)
719+
xpk_exit(1)
720+
721+
681722
def _validate_sub_slicing_topology(
682723
system_characteristics: SystemCharacteristics, sub_slicing_topology: str
683724
) -> None:

src/xpk/commands/workload_test.py

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
from unittest.mock import MagicMock, patch
1919
import pytest
2020
from ..core.system_characteristics import SystemCharacteristics
21-
from .workload import _validate_sub_slicing_topology
21+
from .workload import _validate_sub_slicing_topology, _validate_sub_slicing_availability
22+
from packaging.version import Version
2223

2324

2425
SYSTEM_CHARACTERISTICS = SystemCharacteristics(
@@ -40,7 +41,7 @@ def xpk_print(mocker):
4041

4142

4243
def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
43-
xpk_print,
44+
xpk_print: MagicMock,
4445
):
4546
with pytest.raises(SystemExit):
4647
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
@@ -50,7 +51,9 @@ def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
5051
)
5152

5253

53-
def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print):
54+
def test_validate_sub_slicing_topology_exits_for_too_large_topology(
55+
xpk_print: MagicMock,
56+
):
5457
with pytest.raises(SystemExit):
5558
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
5659

@@ -64,6 +67,86 @@ def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
6467
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
6568

6669

70+
def test_validate_sub_slicing_availability_exits_when_getting_topologies_fails(
71+
xpk_print: MagicMock, mocker
72+
):
73+
mocker.patch(
74+
'xpk.commands.workload.run_command_for_value',
75+
return_value=(1, ''),
76+
)
77+
with pytest.raises(SystemExit):
78+
_validate_sub_slicing_availability()
79+
80+
assert (
81+
'Unable to validate sub-slicing support'
82+
in xpk_print.mock_calls[0].args[0]
83+
)
84+
85+
86+
def test_validate_sub_slicing_availability_exits_when_subslicing_topology_is_not_defined(
87+
xpk_print: MagicMock, mocker
88+
):
89+
mocker.patch(
90+
'xpk.commands.workload.run_command_for_value',
91+
return_value=(0, ''),
92+
)
93+
with pytest.raises(SystemExit):
94+
_validate_sub_slicing_availability()
95+
96+
assert (
97+
'Cluster has not been not set up for Sub-slicing.'
98+
in xpk_print.mock_calls[0].args[0]
99+
)
100+
101+
102+
def test_validate_sub_slicing_availability_exits_when_kueue_version_cannot_be_determined(
103+
xpk_print: MagicMock, mocker
104+
):
105+
mocker.patch(
106+
'xpk.commands.workload.run_command_for_value',
107+
return_value=(0, 'sub-slice-topology'),
108+
)
109+
mocker.patch(
110+
'xpk.commands.workload.KueueManager.get_installed_kueue_version',
111+
return_value=(1, None),
112+
)
113+
with pytest.raises(SystemExit):
114+
_validate_sub_slicing_availability()
115+
116+
assert 'Unable to validate sub-slicing' in xpk_print.mock_calls[0].args[0]
117+
118+
119+
def test_validate_sub_slicing_availability_exits_when_kueue_version_does_not_meet_minimum_requirements(
120+
xpk_print: MagicMock, mocker
121+
):
122+
mocker.patch(
123+
'xpk.commands.workload.run_command_for_value',
124+
return_value=(0, 'sub-slice-topology'),
125+
)
126+
mocker.patch(
127+
'xpk.commands.workload.KueueManager.get_installed_kueue_version',
128+
return_value=(0, Version('0.0.0')),
129+
)
130+
with pytest.raises(SystemExit):
131+
_validate_sub_slicing_availability()
132+
133+
assert 'The minimal required version is' in xpk_print.mock_calls[0].args[0]
134+
135+
136+
def test_validate_sub_slicing_availability_does_nothing_when_cluster_is_correctly_configured_for_subslicing(
137+
mocker,
138+
):
139+
mocker.patch(
140+
'xpk.commands.workload.run_command_for_value',
141+
return_value=(0, 'sub-slice-topology'),
142+
)
143+
mocker.patch(
144+
'xpk.commands.workload.KueueManager.get_installed_kueue_version',
145+
return_value=(0, Version('0.13.0')),
146+
)
147+
_validate_sub_slicing_availability()
148+
149+
67150
@patch('xpk.commands.common.xpk_print')
68151
def test_validate_sub_slicing_topology_fails_for_unsupported_system(
69152
common_xpk_print: MagicMock,

0 commit comments

Comments
 (0)