Skip to content

Commit baf2761

Browse files
hubatish authored and copybara-github committed
Always scale to 1 first before scaling larger
This standardizes performance of GKE Standard (& AKS Standard) vs GKE, EKS, AKS Auto & EKS Karpenter. The Standard variants all spin up the first pod on their default nodepool (which matches the requested machine type), whereas the Auto ones spin up starting from 0. PiperOrigin-RevId: 820411378
1 parent 0fecc30 commit baf2761

File tree

4 files changed

+38
-17
lines changed

4 files changed

+38
-17
lines changed

perfkitbenchmarker/container_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1005,7 +1005,7 @@ def _ParseApplyOutput(stdout: str) -> Iterator[str]:
10051005
"""Parses the output of kubectl apply to get the name of the resource."""
10061006
# Example input: deployment.apps/pkb123 created
10071007
for line in stdout.splitlines():
1008-
match = re.search(r'([^\s/]+/[^\s/]+) created', line)
1008+
match = re.search(r'([^\s/]+/[^\s/]+) (created|configured)', line)
10091009
if match:
10101010
yield match.group(1)
10111011

perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,27 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
123123
cluster = bm_spec.container_cluster
124124
assert isinstance(cluster, container_service.KubernetesCluster)
125125

126+
# Warm up the cluster by creating a single pod. This compensates for
127+
# differences between Standard & Autopilot, where Standard already has 1 node
128+
# due to its starting nodepool but Autopilot does not.
129+
scale_one_samples, _ = ScaleUpPods(cluster, 1)
130+
if not scale_one_samples:
131+
logging.exception(
132+
'Failed to scale up to 1 pod; now investigating failure reasons.'
133+
)
134+
unused = 0
135+
pod_samples = ParseStatusChanges('pod', unused)
136+
# Log & check for quota failure.
137+
_CheckForFailures(cluster, pod_samples, 1)
138+
126139
initial_nodes = set(cluster.GetNodeNames())
140+
initial_pods = set(cluster.GetPodNames())
127141

128-
samples, rollout_name = ScaleUpPods(cluster)
142+
samples, rollout_name = ScaleUpPods(cluster, NUM_PODS.value)
129143
start_time = _GetRolloutCreationTime(rollout_name)
130-
pod_samples = ParseStatusChanges('pod', start_time)
144+
pod_samples = ParseStatusChanges('pod', start_time, initial_pods)
131145
samples += pod_samples
132-
_CheckForFailures(cluster, pod_samples)
146+
_CheckForFailures(cluster, pod_samples, NUM_PODS.value - 1)
133147
samples += ParseStatusChanges(
134148
'node', start_time, resources_to_ignore=initial_nodes
135149
)
@@ -149,6 +163,7 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
149163

150164
def ScaleUpPods(
151165
cluster: container_service.KubernetesCluster,
166+
num_new_pods: int,
152167
) -> tuple[list[sample.Sample], str]:
153168
"""Scales up pods on a kubernetes cluster. Returns samples & rollout name."""
154169
samples = []
@@ -162,7 +177,6 @@ def ScaleUpPods(
162177
command = ['sh', '-c', 'sleep infinity']
163178

164179
# Request X new pods via YAML apply.
165-
num_new_pods = NUM_PODS.value
166180
max_wait_time = _GetScaleTimeout()
167181
resource_names = cluster.ApplyManifest(
168182
MANIFEST_TEMPLATE,
@@ -182,7 +196,6 @@ def ScaleUpPods(
182196
cloud='Azure' if FLAGS.cloud == 'Azure' else None,
183197
)
184198

185-
# Arbitrarily pick the first resource (it should be the only one.)
186199
assert resource_names
187200
rollout_name = next(resource_names)
188201

@@ -217,10 +230,12 @@ def ScaleUpPods(
217230
errors.VmUtil.IssueCommandTimeoutError,
218231
vm_util.TimeoutExceededRetryError,
219232
) as e:
220-
logging.warning(
221-
'Kubernetes failed to wait for all the rollout and/or all pods to be'
222-
' ready, even with retries. Full error: %s. Continuing for now. Failure'
223-
' will be checked later by number of pods with ready events.',
233+
logging.exception(
234+
'Kubernetes waited %s seconds for the rollout to complete and/or all'
235+
' pods to be ready, but they were not, even with retries. Full error:'
236+
' %s. Continuing for now. Failure will be checked later by number of'
237+
' pods with ready events.',
238+
max_wait_time,
224239
e,
225240
)
226241
return [], rollout_name
@@ -229,13 +244,15 @@ def ScaleUpPods(
229244
def _CheckForFailures(
230245
cluster: container_service.KubernetesCluster,
231246
pod_samples: list[sample.Sample],
247+
num_pods: int,
232248
):
233249
"""Fails the benchmark if not enough pods were created.
234250
235251
Args:
236252
cluster: The cluster to check for failures on.
237253
pod_samples: The samples from pod transition times which includes pod Ready
238254
count.
255+
num_pods: The number of pods we attempted to scale up to.
239256
240257
Raises:
241258
QuotaFailure: If a quota is exceeded.
@@ -269,32 +286,32 @@ def _CheckForFailures(
269286
)
270287
if (
271288
ready_count_sample is not None
272-
and ready_count_sample.value >= NUM_PODS.value
289+
and ready_count_sample.value >= num_pods
273290
):
274291
logging.info(
275292
'Benchmark successfully scaled up %d pods, which is equal to or more '
276293
'than the goal of %d pods.',
277294
ready_count_sample.value,
278-
NUM_PODS.value,
295+
num_pods,
279296
)
280297
return
281298
if 'FailedScaleUp' in failure_events_by_reason:
282299
for event in failure_events_by_reason['FailedScaleUp']:
283300
if 'quota exceeded' in event.message:
284301
raise errors.Benchmarks.QuotaFailure(
285302
'Failed to scale up to %d pods, at least one pod ran into a quota'
286-
' error: %s' % (NUM_PODS.value, event.message)
303+
' error: %s' % (num_pods, event.message)
287304
)
288305
if ready_count_sample is None:
289306
raise errors.Benchmarks.RunError(
290307
'No pod ready events were found & we attempted to scale up to'
291-
f' {NUM_PODS.value} pods.'
308+
f' {num_pods} pods.'
292309
)
293310

294311
raise errors.Benchmarks.RunError(
295312
'Benchmark attempted to scale up to %d pods but only %d pods were'
296313
' created & ready. Check above "Kubernetes failed to wait for" logs for'
297-
' exact failure location.' % (NUM_PODS.value, ready_count_sample.value)
314+
' exact failure location.' % (num_pods, ready_count_sample.value)
298315
)
299316

300317

tests/container_service_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,10 @@ def setUp(self):
136136
)
137137
)
138138

139-
def test_apply_manifest_gets_deployment_name(self):
139+
@parameterized.parameters(('created'), ('configured'))
140+
def test_apply_manifest_gets_deployment_name(self, suffix):
140141
self.MockIssueCommand(
141-
{'apply -f': [('deployment.apps/test-deployment created', '', 0)]}
142+
{'apply -f': [(f'deployment.apps/test-deployment {suffix}', '', 0)]}
142143
)
143144
self.enter_context(
144145
mock.patch.object(

tests/linux_benchmarks/kubernetes_scale_benchmark_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ def testCheckFailuresPassesWithCorrectNumberOfPods(self):
322322
sample.Sample('pod_Ready_p90', 95.0, 'seconds'),
323323
sample.Sample('pod_Ready_count', 10, 'count'),
324324
],
325+
9,
325326
)
326327

327328
@flagsaver.flagsaver(kubernetes_scale_num_replicas=10)
@@ -344,6 +345,7 @@ def testCheckFailuresThrowsRegularError(self):
344345
[
345346
sample.Sample('pod_Ready_count', 5, 'count'),
346347
],
348+
9,
347349
)
348350

349351
@flagsaver.flagsaver(kubernetes_scale_num_replicas=10)
@@ -370,6 +372,7 @@ def testCheckFailuresThrowsQuotaExceeded(self):
370372
[
371373
sample.Sample('pod_Ready_count', 5, 'count'),
372374
],
375+
9,
373376
)
374377

375378

0 commit comments

Comments (0)