
Commit de43dab

hubatish authored and copybara-github committed
Upgrade Karpenter & K8s version; delete CloudFormation templates properly

Karpenter runs started failing because my account had hit the maximum number of CloudFormation templates allowed. They weren't being deleted properly because a role and its corresponding instance profile cannot be deleted until they are unlinked.

PiperOrigin-RevId: 825126125
1 parent fbba0c8 commit de43dab
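
To make the ordering problem concrete, here is a minimal standalone sketch of the unlink-then-delete sequence that the new _DeleteDependencies method below performs through vm_util.IssueCommand. The delete_karpenter_stack helper and the Karpenter-<cluster> stack name are assumptions for illustration only; the IAM subcommands (list-instance-profiles-for-role, remove-role-from-instance-profile, delete-instance-profile) are the real AWS CLI calls used in the diff.

```python
# Illustrative sketch, not the PerfKitBenchmarker implementation.
import json
import subprocess


def delete_karpenter_stack(cluster_name: str, region: str) -> None:
  stack_name = f'Karpenter-{cluster_name}'  # assumed naming; see self.stack_name in the diff
  node_role = f'KarpenterNodeRole-{cluster_name}'
  delete_stack = [
      'aws', 'cloudformation', 'delete-stack',
      '--stack-name', stack_name, '--region', region,
  ]
  # First delete attempt: the stack's role is still attached to an instance
  # profile, so removing the role (and thus the stack) is expected to fail.
  subprocess.run(delete_stack, check=False)
  out = subprocess.run(
      ['aws', 'iam', 'list-instance-profiles-for-role',
       '--role-name', node_role, '--region', region],
      capture_output=True, text=True, check=True,
  ).stdout
  for profile in json.loads(out).get('InstanceProfiles', []):
    name = profile['InstanceProfileName']
    # Unlink the role from the instance profile, then delete the profile.
    subprocess.run(
        ['aws', 'iam', 'remove-role-from-instance-profile',
         '--instance-profile-name', name, '--role-name', node_role,
         '--region', region],
        check=True,
    )
    subprocess.run(
        ['aws', 'iam', 'delete-instance-profile',
         '--instance-profile-name', name, '--region', region],
        check=True,
    )
  # Retry the stack delete now that the role can actually be removed.
  subprocess.run(delete_stack, check=False)
```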

File tree (2 files changed: +83, -21 lines)

perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
tests/providers/aws/elastic_kubernetes_service_test.py


perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py

Lines changed: 65 additions & 18 deletions
@@ -634,8 +634,8 @@ def GetNodeSelectors(self, machine_type: str | None = None) -> list[str]:
 
 
 _KARPENTER_NAMESPACE = 'kube-system'
-_KARPENTER_VERSION = '1.5.0'
-_DEAULT_K8S_VERSION = '1.32'
+_KARPENTER_VERSION = '1.8.1'
+_DEAULT_K8S_VERSION = '1.34'
 
 
 class EksKarpenterCluster(BaseEksCluster):
@@ -670,21 +670,27 @@ def _Create(self):
         '-o',
         template_filename,
     ])
-    vm_util.IssueCommand([
-        'aws',
-        'cloudformation',
-        'deploy',
-        '--stack-name',
-        self.stack_name,
-        '--template-file',
-        template_filename,
-        '--capabilities',
-        'CAPABILITY_NAMED_IAM',
-        '--parameter-overrides',
-        f'ClusterName={self.name}',
-        '--region',
-        f'{self.region}',
-    ])
+    # key=value format differs from other service's Key=key,Value=value format
+    formation_tags = [f'{k}={v}' for k, v in util.MakeDefaultTags().items()]
+    vm_util.IssueCommand(
+        [
+            'aws',
+            'cloudformation',
+            'deploy',
+            '--stack-name',
+            self.stack_name,
+            '--template-file',
+            template_filename,
+            '--capabilities',
+            'CAPABILITY_NAMED_IAM',
+            '--parameter-overrides',
+            f'ClusterName={self.name}',
+            '--region',
+            f'{self.region}',
+            '--tags',
+        ]
+        + formation_tags,
+    )
     create_json: dict[str, Any] = {
         'metadata': {
             'tags': {'karpenter.sh/discovery': self.name},
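
A side note on the tag-format comment in this hunk: aws cloudformation deploy --tags takes bare key=value strings, while many other AWS CLI commands (for example aws ec2 create-tags --tags) take Key=key,Value=value structures. A minimal sketch of the difference, using a stand-in dict for whatever util.MakeDefaultTags() returns; the example values mirror the tags asserted in the test diff below.

```python
# Sketch: the same tag dict rendered in the two AWS CLI tag syntaxes.
tags = {'benchmark': 'kubernetes_scale', 'cloud': 'aws'}  # stand-in for util.MakeDefaultTags()

# `aws cloudformation deploy --tags` takes bare key=value strings:
formation_tags = [f'{k}={v}' for k, v in tags.items()]
# -> ['benchmark=kubernetes_scale', 'cloud=aws']

# Most other services (e.g. `aws ec2 create-tags --tags`) take Key=...,Value=...:
ec2_tags = [f'Key={k},Value={v}' for k, v in tags.items()]
# -> ['Key=benchmark,Value=kubernetes_scale', 'Key=cloud,Value=aws']
```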
@@ -1088,15 +1094,56 @@ def _Delete(self):
         self.region,
     ]
     vm_util.IssueCommand(cmd, timeout=1800)
-    vm_util.IssueCommand([
+
+  def _DeleteDependencies(self):
+    """Deletes the CloudFormation stack."""
+    super()._DeleteDependencies()
+    delete_stack_cmd = [
         'aws',
         'cloudformation',
         'delete-stack',
         '--stack-name',
         self.stack_name,
         '--region',
         f'{self.region}',
+    ]
+    # Start deleting the stack but likely to fail to delete this role.
+    vm_util.IssueCommand(delete_stack_cmd)
+    node_role = f'KarpenterNodeRole-{self.name}'
+    out, _, _ = vm_util.IssueCommand([
+        'aws',
+        'iam',
+        'list-instance-profiles-for-role',
+        '--role-name',
+        node_role,
+        '--region',
+        f'{self.region}',
     ])
+    profiles_json = json.loads(out)
+    for profile in profiles_json.get('InstanceProfiles', []):
+      profile_name = profile['InstanceProfileName']
+      vm_util.IssueCommand([
+          'aws',
+          'iam',
+          'remove-role-from-instance-profile',
+          '--instance-profile-name',
+          profile_name,
+          '--role-name',
+          node_role,
+          '--region',
+          f'{self.region}',
+      ])
+      vm_util.IssueCommand([
+          'aws',
+          'iam',
+          'delete-instance-profile',
+          '--instance-profile-name',
+          profile_name,
+          '--region',
+          f'{self.region}',
+      ])
+    # Finish deleting the stack after deleting the role.
+    vm_util.IssueCommand(delete_stack_cmd)
 
   def _IsReady(self):
     """Returns True if cluster is running. Autopilot defaults to 0 nodes."""

tests/providers/aws/elastic_kubernetes_service_test.py

Lines changed: 18 additions & 3 deletions
@@ -12,6 +12,7 @@
 from perfkitbenchmarker.providers.aws import aws_network
 from perfkitbenchmarker.providers.aws import elastic_kubernetes_service
 from perfkitbenchmarker.providers.aws import util
+from tests import matchers
 from tests import pkb_common_test_case
 
 
@@ -340,9 +341,13 @@ def testEksYamlCreateFull(self):
     )
     cluster = elastic_kubernetes_service.EksKarpenterCluster(EKS_SPEC)
     self.MockJsonRead(cluster)
-    self.MockIssueCommand(
-        {'create cluster': [('Cluster created', '', 0)], 'curl': [('', '', 0)]}
-    )
+    mock_cmd = self.MockIssueCommand({
+        'cloudformation deploy': [
+            ('Deployed cloud-formation-template.yaml', '', 0)
+        ],
+        'create cluster': [('Cluster created', '', 0)],
+        'curl': [('', '', 0)],
+    })
     cluster._Create()
     assert self.patched_read_json is not None
     called_json = self.patched_read_json.call_args_list[0][0][0]
@@ -376,6 +381,16 @@ def testEksYamlCreateFull(self):
     self.assertEqual(
         called_json['addons'], [{'name': 'eks-pod-identity-agent'}]
     )
+    mock_cmd.func_to_mock.assert_has_calls([
+        mock.call(
+            matchers.HASALLOF(
+                'cloudformation',
+                'deploy',
+                'benchmark=kubernetes_scale',
+                'cloud=aws',
+            )
+        ),
+    ])
 
   def testRecursiveDictionaryUpdate(self):
     base = {'a': 1, 'deep': {'c': 2}}
