Always scale to 1 first before scaling larger

hubatish · copybara-github · commit baf27615ab7a · 2025-10-16T15:09:55.000-07:00
This standardizes performance of GKE Standard (& AKS Standard) vs GKE, EKS, AKS Auto & EKS Karpenter. The Standard variants all spin up the first pod on their default nodepool (which matches the requested machine type), whereas the Auto ones spin up starting from 0 nodes.

PiperOrigin-RevId: 820411378
diff --git a/perfkitbenchmarker/container_service.py b/perfkitbenchmarker/container_service.py
@@ -1005,7 +1005,7 @@ def _ParseApplyOutput(stdout: str) -> Iterator[str]:
       """Parses the output of kubectl apply to get the name of the resource."""
       # Example input: deployment.apps/pkb123 created
       for line in stdout.splitlines():
-        match = re.search(r'([^\s/]+/[^\s/]+) created', line)
+        match = re.search(r'([^\s/]+/[^\s/]+) (created|configured)', line)
         if match:
           yield match.group(1)
 
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py
@@ -123,13 +123,27 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
   cluster = bm_spec.container_cluster
   assert isinstance(cluster, container_service.KubernetesCluster)
 
+  # Warm up the cluster by creating a single pod. This compensates for
+  # differences between Standard & Autopilot, where Standard already has 1 node
+  # due to its starting nodepool but Autopilot does not.
+  scale_one_samples, _ = ScaleUpPods(cluster, 1)
+  if not scale_one_samples:
+    logging.exception(
+        'Failed to scale up to 1 pod; now investigating failure reasons.'
+    )
+    unused = 0
+    pod_samples = ParseStatusChanges('pod', unused)
+    # Log & check for quota failure.
+    _CheckForFailures(cluster, pod_samples, 1)
+
   initial_nodes = set(cluster.GetNodeNames())
+  initial_pods = set(cluster.GetPodNames())
 
-  samples, rollout_name = ScaleUpPods(cluster)
+  samples, rollout_name = ScaleUpPods(cluster, NUM_PODS.value)
   start_time = _GetRolloutCreationTime(rollout_name)
-  pod_samples = ParseStatusChanges('pod', start_time)
+  pod_samples = ParseStatusChanges('pod', start_time, initial_pods)
   samples += pod_samples
-  _CheckForFailures(cluster, pod_samples)
+  _CheckForFailures(cluster, pod_samples, NUM_PODS.value - 1)
   samples += ParseStatusChanges(
       'node', start_time, resources_to_ignore=initial_nodes
   )
@@ -149,6 +163,7 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
 
 def ScaleUpPods(
     cluster: container_service.KubernetesCluster,
+    num_new_pods: int,
 ) -> tuple[list[sample.Sample], str]:
   """Scales up pods on a kubernetes cluster. Returns samples & rollout name."""
   samples = []
@@ -162,7 +177,6 @@ def ScaleUpPods(
     command = ['sh', '-c', 'sleep infinity']
 
   # Request X new pods via YAML apply.
-  num_new_pods = NUM_PODS.value
   max_wait_time = _GetScaleTimeout()
   resource_names = cluster.ApplyManifest(
       MANIFEST_TEMPLATE,
@@ -182,7 +196,6 @@ def ScaleUpPods(
       cloud='Azure' if FLAGS.cloud == 'Azure' else None,
   )
 
-  # Arbitrarily pick the first resource (it should be the only one.)
   assert resource_names
   rollout_name = next(resource_names)
 
@@ -217,10 +230,12 @@ def ScaleUpPods(
       errors.VmUtil.IssueCommandTimeoutError,
       vm_util.TimeoutExceededRetryError,
   ) as e:
-    logging.warning(
-        'Kubernetes failed to wait for all the rollout and/or all pods to be'
-        ' ready, even with retries. Full error: %s. Continuing for now. Failure'
-        ' will be checked later by number of pods with ready events.',
+    logging.exception(
+        'Kubernetes waited %s seconds for the rollout to complete and/or all'
+        ' pods to be ready, but they were not, even with retries. Full error:'
+        ' %s. Continuing for now. Failure will be checked later by number of'
+        ' pods with ready events.',
+        max_wait_time,
         e,
     )
     return [], rollout_name
@@ -229,13 +244,15 @@ def ScaleUpPods(
 def _CheckForFailures(
     cluster: container_service.KubernetesCluster,
     pod_samples: list[sample.Sample],
+    num_pods: int,
 ):
   """Fails the benchmark if not enough pods were created.
 
   Args:
     cluster: The cluster to check for failures on.
     pod_samples: The samples from pod transition times which includes pod Ready
       count.
+    num_pods: The number of pods we attempted to scale up to.
 
   Raises:
     QuotaFailure: If a quota is exceeded.
@@ -269,32 +286,32 @@ def _CheckForFailures(
   )
   if (
       ready_count_sample is not None
-      and ready_count_sample.value >= NUM_PODS.value
+      and ready_count_sample.value >= num_pods
   ):
     logging.info(
         'Benchmark successfully scaled up %d pods, which is equal to or more '
         'than the goal of %d pods.',
         ready_count_sample.value,
-        NUM_PODS.value,
+        num_pods,
     )
     return
   if 'FailedScaleUp' in failure_events_by_reason:
     for event in failure_events_by_reason['FailedScaleUp']:
       if 'quota exceeded' in event.message:
         raise errors.Benchmarks.QuotaFailure(
             'Failed to scale up to %d pods, at least one pod ran into a quota'
-            ' error: %s' % (NUM_PODS.value, event.message)
+            ' error: %s' % (num_pods, event.message)
         )
   if ready_count_sample is None:
     raise errors.Benchmarks.RunError(
         'No pod ready events were found & we attempted to scale up to'
-        f' {NUM_PODS.value} pods.'
+        f' {num_pods} pods.'
     )
 
   raise errors.Benchmarks.RunError(
       'Benchmark attempted to scale up to  %d pods but only %d pods were'
       ' created & ready. Check above "Kubernetes failed to wait for" logs for'
-      ' exact failure location.' % (NUM_PODS.value, ready_count_sample.value)
+      ' exact failure location.' % (num_pods, ready_count_sample.value)
   )
 
 
diff --git a/tests/container_service_test.py b/tests/container_service_test.py
@@ -136,9 +136,10 @@ def setUp(self):
         )
     )
 
-  def test_apply_manifest_gets_deployment_name(self):
+  @parameterized.parameters(('created'), ('configured'))
+  def test_apply_manifest_gets_deployment_name(self, suffix):
     self.MockIssueCommand(
-        {'apply -f': [('deployment.apps/test-deployment created', '', 0)]}
+        {'apply -f': [(f'deployment.apps/test-deployment {suffix}', '', 0)]}
     )
     self.enter_context(
         mock.patch.object(
diff --git a/tests/linux_benchmarks/kubernetes_scale_benchmark_test.py b/tests/linux_benchmarks/kubernetes_scale_benchmark_test.py
@@ -322,6 +322,7 @@ def testCheckFailuresPassesWithCorrectNumberOfPods(self):
             sample.Sample('pod_Ready_p90', 95.0, 'seconds'),
             sample.Sample('pod_Ready_count', 10, 'count'),
         ],
+        9,
     )
 
   @flagsaver.flagsaver(kubernetes_scale_num_replicas=10)
@@ -344,6 +345,7 @@ def testCheckFailuresThrowsRegularError(self):
           [
               sample.Sample('pod_Ready_count', 5, 'count'),
           ],
+          9,
       )
 
   @flagsaver.flagsaver(kubernetes_scale_num_replicas=10)
@@ -370,6 +372,7 @@ def testCheckFailuresThrowsQuotaExceeded(self):
           [
               sample.Sample('pod_Ready_count', 5, 'count'),
           ],
+          9,
       )
 
 

Original file line number	Diff line number	Diff line change
`@@ -136,9 +136,10 @@ def setUp(self):`
`136`	`136`	`)`
`137`	`137`	`)`
`138`	`138`
`139`		`- def test_apply_manifest_gets_deployment_name(self):`
	`139`	`+ @parameterized.parameters(('created'), ('configured'))`
	`140`	`+ def test_apply_manifest_gets_deployment_name(self, suffix):`
`140`	`141`	`self.MockIssueCommand(`
`141`		`- {'apply -f': [('deployment.apps/test-deployment created', '', 0)]}`
	`142`	`+ {'apply -f': [(f'deployment.apps/test-deployment {suffix}', '', 0)]}`
`142`	`143`	`)`
`143`	`144`	`self.enter_context(`
`144`	`145`	`mock.patch.object(`
Original file line number	Diff line number	Diff line change
`@@ -322,6 +322,7 @@ def testCheckFailuresPassesWithCorrectNumberOfPods(self):`
`322`	`322`	`sample.Sample('pod_Ready_p90', 95.0, 'seconds'),`
`323`	`323`	`sample.Sample('pod_Ready_count', 10, 'count'),`
`324`	`324`	`],`
	`325`	`+ 9,`
`325`	`326`	`)`
`326`	`327`
`327`	`328`	`@flagsaver.flagsaver(kubernetes_scale_num_replicas=10)`
`@@ -344,6 +345,7 @@ def testCheckFailuresThrowsRegularError(self):`
`344`	`345`	`[`
`345`	`346`	`sample.Sample('pod_Ready_count', 5, 'count'),`
`346`	`347`	`],`
	`348`	`+ 9,`
`347`	`349`	`)`
`348`	`350`
`349`	`351`	`@flagsaver.flagsaver(kubernetes_scale_num_replicas=10)`
`@@ -370,6 +372,7 @@ def testCheckFailuresThrowsQuotaExceeded(self):`
`370`	`372`	`[`
`371`	`373`	`sample.Sample('pod_Ready_count', 5, 'count'),`
`372`	`374`	`],`
	`375`	`+ 9,`
`373`	`376`	`)`
`374`	`377`
`375`	`378`