Skip to content

Commit 37dfc67

Browse files
authored
[AKS] az aks nodepool add: Add GPU driver install options install and none for --gpu-driver parameter (#31106)
1 parent 59193f0 commit 37dfc67

File tree

8 files changed

+2273
-0
lines changed

8 files changed

+2273
-0
lines changed

src/azure-cli/azure/cli/command_modules/acs/_consts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@
5353
CONST_GPU_INSTANCE_PROFILE_MIG4_G = "MIG4g"
5454
CONST_GPU_INSTANCE_PROFILE_MIG7_G = "MIG7g"
5555

56+
# gpu driver install modes (the values offered for the --gpu-driver option)
CONST_GPU_DRIVER_INSTALL = "Install"
CONST_GPU_DRIVER_NONE = "None"
59+
5660
# consts for ManagedCluster
5761
# load balancer sku
5862
CONST_LOAD_BALANCER_SKU_BASIC = "basic"

src/azure-cli/azure/cli/command_modules/acs/_help.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,6 +1671,9 @@
16711671
- name: --if-none-match
16721672
type: string
16731673
short-summary: Set to '*' to allow a new agentpool to be created, but to prevent updating an existing agentpool. Other values will be ignored.
1674+
- name: --gpu-driver
1675+
type: string
1676+
short-summary: Whether to install driver for GPU node pool. Possible values are "Install" or "None". Default is "Install".
16741677
16751678
examples:
16761679
- name: Create a nodepool in an existing AKS cluster with ephemeral os enabled.

src/azure-cli/azure/cli/command_modules/acs/_params.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
CONST_GPU_INSTANCE_PROFILE_MIG1_G, CONST_GPU_INSTANCE_PROFILE_MIG2_G,
1717
CONST_GPU_INSTANCE_PROFILE_MIG3_G, CONST_GPU_INSTANCE_PROFILE_MIG4_G,
1818
CONST_GPU_INSTANCE_PROFILE_MIG7_G, CONST_LOAD_BALANCER_SKU_BASIC,
19+
CONST_GPU_DRIVER_INSTALL, CONST_GPU_DRIVER_NONE,
1920
CONST_LOAD_BALANCER_SKU_STANDARD, CONST_MANAGED_CLUSTER_SKU_TIER_FREE,
2021
CONST_MANAGED_CLUSTER_SKU_TIER_STANDARD, CONST_MANAGED_CLUSTER_SKU_TIER_PREMIUM,
2122
CONST_NETWORK_DATAPLANE_AZURE, CONST_NETWORK_DATAPLANE_CILIUM,
@@ -194,6 +195,11 @@
194195
CONST_GPU_INSTANCE_PROFILE_MIG7_G,
195196
]
196197

198+
# allowed values for the gpu driver install option
gpu_driver_install_modes = [
    CONST_GPU_DRIVER_INSTALL,
    CONST_GPU_DRIVER_NONE,
]
202+
197203
nrg_lockdown_restriction_levels = [
198204
CONST_NRG_LOCKDOWN_RESTRICTION_LEVEL_READONLY,
199205
CONST_NRG_LOCKDOWN_RESTRICTION_LEVEL_UNRESTRICTED,
@@ -828,6 +834,7 @@ def load_arguments(self, _):
828834
c.argument('enable_secure_boot', action='store_true')
829835
c.argument("if_match")
830836
c.argument("if_none_match")
837+
c.argument('gpu_driver', arg_type=get_enum_type(gpu_driver_install_modes))
831838

832839
with self.argument_context('aks nodepool update', resource_type=ResourceType.MGMT_CONTAINERSERVICE, operation_group='agent_pools') as c:
833840
c.argument('enable_cluster_autoscaler', options_list=[

src/azure-cli/azure/cli/command_modules/acs/agentpool_decorator.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,6 +1551,35 @@ def get_if_none_match(self) -> str:
15511551
"""
15521552
return self.raw_param.get("if_none_match")
15531553

1554+
def _get_gpu_driver(self) -> Union[str, None]:
    """Resolve the value of gpu_driver.

    The value passed by the command is used by default. In create mode, a non-None
    `gpu_profile.driver` found on the attached `agentpool` object takes precedence.

    :return: string or None
    """
    # the original value passed by the command
    requested_driver = self.raw_param.get("gpu_driver")

    # outside create mode only the raw parameter is consulted
    if self.decorator_mode != DecoratorMode.CREATE:
        return requested_driver

    pool = self.agentpool
    # getattr with a default tolerates an agentpool lacking `gpu_profile` (backward compatibility)
    profile = getattr(pool, "gpu_profile", None) if pool else None
    if profile and profile.driver is not None:
        return profile.driver

    # this parameter does not need dynamic completion or validation
    return requested_driver
1575+
1576+
def get_gpu_driver(self) -> Union[str, None]:
    """Return the value of gpu_driver as resolved by `_get_gpu_driver`.

    :return: string or None
    """
    gpu_driver = self._get_gpu_driver()
    return gpu_driver
1582+
15541583

15551584
class AKSAgentPoolAddDecorator:
15561585
def __init__(
@@ -1915,6 +1944,22 @@ def set_up_agentpool_windows_profile(self, agentpool: AgentPool) -> AgentPool:
19151944

19161945
return agentpool
19171946

1947+
def set_up_gpu_profile(self, agentpool: AgentPool) -> AgentPool:
    """Set up the gpu profile of the AgentPool object from the gpu_driver option.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    driver_mode = self.context.get_gpu_driver()
    # leave the agentpool untouched when no driver mode was resolved
    if not driver_mode:
        return agentpool

    # construct a fresh GPUProfile carrying the driver mode
    agentpool.gpu_profile = self.models.GPUProfile()
    agentpool.gpu_profile.driver = driver_mode
    return agentpool
1962+
19181963
def construct_agentpool_profile_default(self, bypass_restore_defaults: bool = False) -> AgentPool:
19191964
"""The overall controller used to construct the AgentPool profile by default.
19201965
@@ -1959,6 +2004,8 @@ def construct_agentpool_profile_default(self, bypass_restore_defaults: bool = Fa
19592004
agentpool = self.set_up_agentpool_security_profile(agentpool)
19602005
# set up message of the day
19612006
agentpool = self.set_up_motd(agentpool)
2007+
# set up gpu profile
2008+
agentpool = self.set_up_gpu_profile(agentpool)
19622009
# restore defaults
19632010
if not bypass_restore_defaults:
19642011
agentpool = self._restore_defaults_in_agentpool(agentpool)

src/azure-cli/azure/cli/command_modules/acs/custom.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2426,6 +2426,8 @@ def aks_agentpool_add(
24262426
# etag headers
24272427
if_match=None,
24282428
if_none_match=None,
2429+
# gpu driver
2430+
gpu_driver=None,
24292431
):
24302432
# DO NOT MOVE: get all the original parameters and save them as a dictionary
24312433
raw_parameters = locals()

src/azure-cli/azure/cli/command_modules/acs/tests/latest/recordings/test_aks_create_gpu_driver_flow.yaml

Lines changed: 2086 additions & 0 deletions
Large diffs are not rendered by default.

src/azure-cli/azure/cli/command_modules/acs/tests/latest/test_agentpool_decorator.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,23 @@ def common_get_node_public_ip_prefix_id(self):
690690
ctx_1.attach_agentpool(agentpool)
691691
self.assertEqual(ctx_1.get_node_public_ip_prefix_id(), "test_node_public_ip_prefix_id")
692692

693+
def common_get_gpu_driver(self):
    """Check get_gpu_driver() before and after an agentpool with a gpu profile is attached."""
    raw_params = AKSAgentPoolParamDict({"gpu_driver": None})
    ctx_1 = AKSAgentPoolContext(
        self.cmd,
        raw_params,
        self.models,
        DecoratorMode.CREATE,
        self.agentpool_decorator_mode,
    )
    # no raw value and no agentpool attached yet
    self.assertEqual(ctx_1.get_gpu_driver(), None)

    # in create mode the driver set on the attached agentpool is returned
    install_profile = self.models.GPUProfile(driver="Install")
    agentpool = self.create_initialized_agentpool_instance(gpu_profile=install_profile)
    ctx_1.attach_agentpool(agentpool)
    self.assertEqual(ctx_1.get_gpu_driver(), "Install")
709+
693710
def common_get_node_count_and_enable_cluster_autoscaler_min_max_count(
694711
self,
695712
):
@@ -1788,6 +1805,9 @@ def test_get_if_match(self):
17881805
def test_get_if_none_match(self):
17891806
self.get_if_none_match()
17901807

1808+
def test_get_gpu_driver(self):
    """Run the shared get_gpu_driver checks for this test case."""
    self.common_get_gpu_driver()
1810+
17911811
class AKSAgentPoolContextManagedClusterModeTestCase(AKSAgentPoolContextCommonTestCase):
17921812
def setUp(self):
17931813
self.cli_ctx = MockCLI()
@@ -2431,6 +2451,28 @@ def common_set_up_agentpool_security_profile(self):
24312451
)
24322452
self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1)
24332453

2454+
def common_set_up_gpu_profile(self):
    """Check that set_up_gpu_profile() rejects a wrong agentpool and sets the gpu driver."""
    raw_params = {"gpu_driver": "Install"}
    dec_1 = AKSAgentPoolAddDecorator(
        self.cmd,
        self.client,
        raw_params,
        self.resource_type,
        self.agentpool_decorator_mode,
    )
    # fail on passing the wrong agentpool object
    with self.assertRaises(CLIInternalError):
        dec_1.set_up_gpu_profile(None)

    agentpool_1 = self.create_initialized_agentpool_instance(restore_defaults=False)
    dec_1.context.attach_agentpool(agentpool_1)
    dec_agentpool_1 = dec_1.set_up_gpu_profile(agentpool_1)
    dec_agentpool_1 = self._restore_defaults_in_agentpool(dec_agentpool_1)

    # the expected agentpool carries a gpu profile with the requested driver mode
    expected_gpu_profile = self.models.GPUProfile(
        driver="Install",
    )
    ground_truth_agentpool_1 = self.create_initialized_agentpool_instance(
        gpu_profile=expected_gpu_profile
    )
    self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1)
2475+
24342476
class AKSAgentPoolAddDecoratorStandaloneModeTestCase(AKSAgentPoolAddDecoratorCommonTestCase):
24352477
def setUp(self):
24362478
self.cli_ctx = MockCLI()
@@ -2481,6 +2523,9 @@ def test_set_up_agentpool_windows_profile(self):
24812523

24822524
def test_set_up_agentpool_security_profile(self):
24832525
self.common_set_up_agentpool_security_profile()
2526+
2527+
def test_set_up_gpu_profile(self):
    """Run the shared set_up_gpu_profile checks for this test case."""
    self.common_set_up_gpu_profile()
24842529

24852530
def test_construct_agentpool_profile_default(self):
24862531
import inspect

src/azure-cli/azure/cli/command_modules/acs/tests/latest/test_aks_commands.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12373,3 +12373,82 @@ def test_aks_network_isolated_cluster(self, resource_group, resource_group_locat
1237312373
self.cmd("aks delete -g {resource_group} -n {aks_name_1} --yes --no-wait", checks=[self.is_empty()])
1237412374
self.cmd("aks delete -g {resource_group} -n {aks_name_2} --yes --no-wait", checks=[self.is_empty()])
1237512375
self.cmd("aks delete -g {resource_group} -n {aks_name_3} --yes --no-wait", checks=[self.is_empty()])
12376+
12377+
12378+
@AllowLargeResponse()
@AKSCustomResourceGroupPreparer(
    random_name_length=17, name_prefix="clitest", location="westus3"
)
def test_aks_create_gpu_driver_flow(self, resource_group, resource_group_location):
    """Exercise `aks nodepool add --gpu-driver` with both accepted values.

    Flow: create a cluster, add a node pool with `--gpu-driver None` and check
    `gpuProfile.driver`, delete that node pool, add a second node pool with
    `--gpu-driver Install` and check `gpuProfile.driver`, then delete the cluster.
    """
    # reset the count so in replay mode the random names will start with 0
    self.test_resources_count = 0
    aks_name = self.create_random_name("cliakstest", 16)
    node_pool_name = self.create_random_name("c", 6)
    node_pool_name_second = self.create_random_name("c", 6)
    self.kwargs.update(
        {
            "resource_group": resource_group,
            "name": aks_name,
            "dns_name_prefix": self.create_random_name("cliaksdns", 16),
            "location": resource_group_location,
            "resource_type": "Microsoft.ContainerService/ManagedClusters",
            "node_pool_name": node_pool_name,
            "node_pool_name_second": node_pool_name_second,
            "ssh_key_value": self.generate_ssh_keys(),
            "node_vm_size": "standard_nc6s_v3"
        }
    )

    # 1. create the cluster
    create_cmd = (
        "aks create --resource-group={resource_group} --name={name} --location={location} "
        "--enable-managed-identity "
        "--ssh-key-value={ssh_key_value} "
    )
    self.cmd(create_cmd, checks=[
        self.check('provisioningState', 'Succeeded')
    ])

    # 2. add the first nodepool with --gpu-driver None
    self.cmd(
        "aks nodepool add "
        "--resource-group={resource_group} "
        "--cluster-name={name} "
        "--name={node_pool_name} "
        "--node-vm-size={node_vm_size} "
        "-c 1 "
        "--os-type Linux "
        "--gpu-driver None",
        checks=[
            self.check("provisioningState", "Succeeded"),
            self.check("gpuProfile.driver", "None"),
        ],
    )

    # delete the first nodepool before adding the second one
    self.cmd(
        "aks nodepool delete --resource-group={resource_group} --cluster-name={name} --name={node_pool_name}",
        checks=[self.is_empty()],
    )

    # 3. add the second nodepool with --gpu-driver Install
    self.cmd(
        "aks nodepool add "
        "--resource-group={resource_group} "
        "--cluster-name={name} "
        "--name={node_pool_name_second} "
        "--node-vm-size={node_vm_size} "
        "-c 1 "
        "--os-type Linux "
        "--gpu-driver Install",
        checks=[
            self.check("provisioningState", "Succeeded"),
            self.check("gpuProfile.driver", "Install"),
        ],
    )

    # delete the cluster
    self.cmd(
        "aks delete -g {resource_group} -n {name} --yes --no-wait",
        checks=[self.is_empty()],
    )

0 commit comments

Comments
 (0)