@@ -123,13 +123,27 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
123123 cluster = bm_spec .container_cluster
124124 assert isinstance (cluster , container_service .KubernetesCluster )
125125
126+ # Warm up the cluster by creating a single pod. This compensates for
127+ # differences between Standard & Autopilot, where Standard already has 1 node
128+ # due to its starting nodepool but Autopilot does not.
129+ scale_one_samples , _ = ScaleUpPods (cluster , 1 )
130+ if not scale_one_samples :
131+ logging .exception (
132+ 'Failed to scale up to 1 pod; now investigating failure reasons.'
133+ )
134+ unused = 0
135+ pod_samples = ParseStatusChanges ('pod' , unused )
136+ # Log & check for quota failure.
137+ _CheckForFailures (cluster , pod_samples , 1 )
138+
126139 initial_nodes = set (cluster .GetNodeNames ())
140+ initial_pods = set (cluster .GetPodNames ())
127141
128- samples , rollout_name = ScaleUpPods (cluster )
142+ samples , rollout_name = ScaleUpPods (cluster , NUM_PODS . value )
129143 start_time = _GetRolloutCreationTime (rollout_name )
130- pod_samples = ParseStatusChanges ('pod' , start_time )
144+ pod_samples = ParseStatusChanges ('pod' , start_time , initial_pods )
131145 samples += pod_samples
132- _CheckForFailures (cluster , pod_samples )
146+ _CheckForFailures (cluster , pod_samples , NUM_PODS . value - 1 )
133147 samples += ParseStatusChanges (
134148 'node' , start_time , resources_to_ignore = initial_nodes
135149 )
@@ -149,6 +163,7 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
149163
150164def ScaleUpPods (
151165 cluster : container_service .KubernetesCluster ,
166+ num_new_pods : int ,
152167) -> tuple [list [sample .Sample ], str ]:
153168 """Scales up pods on a kubernetes cluster. Returns samples & rollout name."""
154169 samples = []
@@ -162,7 +177,6 @@ def ScaleUpPods(
162177 command = ['sh' , '-c' , 'sleep infinity' ]
163178
164179 # Request X new pods via YAML apply.
165- num_new_pods = NUM_PODS .value
166180 max_wait_time = _GetScaleTimeout ()
167181 resource_names = cluster .ApplyManifest (
168182 MANIFEST_TEMPLATE ,
@@ -182,7 +196,6 @@ def ScaleUpPods(
182196 cloud = 'Azure' if FLAGS .cloud == 'Azure' else None ,
183197 )
184198
185- # Arbitrarily pick the first resource (it should be the only one.)
186199 assert resource_names
187200 rollout_name = next (resource_names )
188201
@@ -217,10 +230,12 @@ def ScaleUpPods(
217230 errors .VmUtil .IssueCommandTimeoutError ,
218231 vm_util .TimeoutExceededRetryError ,
219232 ) as e :
220- logging .warning (
221- 'Kubernetes failed to wait for all the rollout and/or all pods to be'
222- ' ready, even with retries. Full error: %s. Continuing for now. Failure'
223- ' will be checked later by number of pods with ready events.' ,
233+ logging .exception (
234+ 'Kubernetes waited %s seconds for the rollout to complete and/or all'
235+ ' pods to be ready, but they were not, even with retries. Full error:'
236+ ' %s. Continuing for now. Failure will be checked later by number of'
237+ ' pods with ready events.' ,
238+ max_wait_time ,
224239 e ,
225240 )
226241 return [], rollout_name
@@ -229,13 +244,15 @@ def ScaleUpPods(
229244def _CheckForFailures (
230245 cluster : container_service .KubernetesCluster ,
231246 pod_samples : list [sample .Sample ],
247+ num_pods : int ,
232248):
233249 """Fails the benchmark if not enough pods were created.
234250
235251 Args:
236252 cluster: The cluster to check for failures on.
237253 pod_samples: The samples from pod transition times which includes pod Ready
238254 count.
255+ num_pods: The number of pods we attempted to scale up to.
239256
240257 Raises:
241258 QuotaFailure: If a quota is exceeded.
@@ -269,32 +286,32 @@ def _CheckForFailures(
269286 )
270287 if (
271288 ready_count_sample is not None
272- and ready_count_sample .value >= NUM_PODS . value
289+ and ready_count_sample .value >= num_pods
273290 ):
274291 logging .info (
275292 'Benchmark successfully scaled up %d pods, which is equal to or more '
276293 'than the goal of %d pods.' ,
277294 ready_count_sample .value ,
278- NUM_PODS . value ,
295+ num_pods ,
279296 )
280297 return
281298 if 'FailedScaleUp' in failure_events_by_reason :
282299 for event in failure_events_by_reason ['FailedScaleUp' ]:
283300 if 'quota exceeded' in event .message :
284301 raise errors .Benchmarks .QuotaFailure (
285302 'Failed to scale up to %d pods, at least one pod ran into a quota'
286- ' error: %s' % (NUM_PODS . value , event .message )
303+ ' error: %s' % (num_pods , event .message )
287304 )
288305 if ready_count_sample is None :
289306 raise errors .Benchmarks .RunError (
290307 'No pod ready events were found & we attempted to scale up to'
291- f' { NUM_PODS . value } pods.'
308+ f' { num_pods } pods.'
292309 )
293310
294311 raise errors .Benchmarks .RunError (
295312 'Benchmark attempted to scale up to %d pods but only %d pods were'
296313 ' created & ready. Check above "Kubernetes failed to wait for" logs for'
297- ' exact failure location.' % (NUM_PODS . value , ready_count_sample .value )
314+ ' exact failure location.' % (num_pods , ready_count_sample .value )
298315 )
299316
300317
0 commit comments