Skip to content

Commit 0eb5626

Browse files
hubatish authored and copybara-github committed
Run AddNodepool in parallel
This matches the nodepool provisioning implementation for GKE, which responds to the first yaml / manifest application rather than waiting on individual AddNodepool commands like EKS & AKS. This should make results somewhat more comparable. Also added more detailed samples for apply time vs "wait time" to help confirm that the majority of time is spent waiting for Kubernetes to do operations in the background rather than actively writing & applying yamls. PiperOrigin-RevId: 828046506
1 parent fb4db29 commit 0eb5626

File tree

2 files changed

+135
-27
lines changed

2 files changed

+135
-27
lines changed

perfkitbenchmarker/linux_benchmarks/provisioning_benchmarks/provision_node_pools_benchmark.py

Lines changed: 77 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import time
3939
from typing import List
4040
from absl import flags
41+
from perfkitbenchmarker import background_tasks
4142
from perfkitbenchmarker import benchmark_spec as bm_spec
4243
from perfkitbenchmarker import configs
4344
from perfkitbenchmarker import container_service
@@ -94,9 +95,25 @@ def Prepare(_: bm_spec.BenchmarkSpec) -> None:
9495
pass
9596

9697

98+
def _AddNodePool(
    cluster: container_service.KubernetesCluster,
    batch_name: str,
    pool_id: str,
) -> None:
  """Creates one node pool on the cluster and applies its job manifest.

  Args:
    cluster: Cluster that receives the new node pool.
    batch_name: Name of the batch this node pool belongs to.
    pool_id: Identifier of this pool within the batch (e.g. "001").
  """
  # Node pool first, then the job manifest that targets it.
  cluster.AddNodepool(batch_name, pool_id=pool_id)
  cluster.ApplyManifest(
      JOB_MANIFEST_TEMPLATE,
      id=pool_id,
      batch=batch_name,
      cloud=cluster.CLOUD,
      gpu=USE_GPU.value,
  )
112+
113+
97114
def _CreateJobsAndWait(
98115
cluster: container_service.KubernetesCluster, batch_name: str, jobs: int
99-
) -> None:
116+
) -> list[sample.Sample]:
100117
"""Creates jobs and waits for all pods to be running."""
101118
logging.info(
102119
"Creating batch '%s' of %d jobs, each job running in a separate node in a"
@@ -105,22 +122,33 @@ def _CreateJobsAndWait(
105122
jobs,
106123
)
107124

125+
samples = []
108126
apply_start = time.monotonic()
109-
for i in range(jobs):
110-
cluster.AddNodepool(batch_name, pool_id="{:03d}".format(i + 1))
111-
cluster.ApplyManifest(
112-
JOB_MANIFEST_TEMPLATE,
113-
batch=batch_name,
114-
gpu=USE_GPU.value,
115-
cloud=cluster.CLOUD,
116-
id="{:03d}".format(i + 1),
117-
)
127+
tasks = []
128+
for i in range(2, jobs + 1):
129+
tasks.append((
130+
_AddNodePool,
131+
[cluster, batch_name, "{:03d}".format(i)],
132+
{},
133+
))
134+
# Add the first node pool + batch prior to the rest.
135+
logging.info("Creating the first node pool 001")
136+
_AddNodePool(cluster, batch_name, "001")
137+
background_tasks.RunParallelThreads(tasks, len(tasks))
138+
apply_time = time.monotonic() - apply_start
118139
logging.info(
119140
"Created %d jobs in batch '%s' in %d seconds. Waiting for all pods to be"
120141
" running",
121142
jobs,
122143
batch_name,
123-
time.monotonic() - apply_start,
144+
apply_time,
145+
)
146+
samples.append(
147+
sample.Sample(
148+
"apply_time",
149+
apply_time,
150+
"seconds",
151+
)
124152
)
125153
start = time.monotonic()
126154
# wait up to 2 min per node pool + 45 min for master resizes
@@ -144,7 +172,7 @@ def _CreateJobsAndWait(
144172
if running >= jobs:
145173
break
146174
logging.info(
147-
"Running jobs in batch '%s': %d/%d. Time: %d seconds.",
175+
"Waiting for jobs in batch '%s': %d/%d. Time: %d seconds.",
148176
batch_name,
149177
running,
150178
jobs,
@@ -165,12 +193,21 @@ def _CreateJobsAndWait(
165193
% batch_name
166194
)
167195
time.sleep(60)
196+
ready_time = time.monotonic() - start
168197
logging.info(
169198
"All %d jobs in batch '%s' are running. Wait time: %d seconds.",
170199
jobs,
171200
batch_name,
172-
time.monotonic() - start,
201+
ready_time,
202+
)
203+
samples.append(
204+
sample.Sample(
205+
"ready_time",
206+
ready_time,
207+
"seconds",
208+
)
173209
)
210+
return samples
174211

175212

176213
def _AssertNodes(
@@ -226,26 +263,39 @@ def _CreateNodePools(
226263
nodes_before = len(cluster.GetNodeNames())
227264
nodes_pools_before = len(cluster.GetNodePoolNames())
228265
start = time.monotonic()
229-
_CreateJobsAndWait(cluster, batch_name, node_pools_to_add)
266+
samples = _CreateJobsAndWait(cluster, batch_name, node_pools_to_add)
230267
elapsed = time.monotonic() - start
231268
_AssertNodes(cluster, nodes_before, node_pools_to_add)
232269
_AssertNodePools(cluster, nodes_pools_before, node_pools_to_add)
233-
metadata = {"node_pools_created": node_pools_to_add}
234-
metric_batch_name = batch_name.replace("-", "_")
235-
return [
270+
samples.append(
236271
sample.Sample(
237-
"%s_provisioning_time" % metric_batch_name,
272+
"provisioning_time",
238273
elapsed,
239274
"seconds",
240-
metadata,
241-
),
242-
sample.Sample(
243-
"%s_provisioning_time_per_node_pool" % metric_batch_name,
244-
elapsed / node_pools_to_add,
245-
"seconds",
246-
metadata,
247-
),
248-
]
275+
)
276+
)
277+
metadata = {"node_pools_created": node_pools_to_add}
278+
metric_batch_name = batch_name.replace("-", "_")
279+
final_samples = []
280+
for s in samples:
281+
metric_name = f"{metric_batch_name}_{s.metric}"
282+
final_samples.append(
283+
sample.Sample(
284+
metric_name,
285+
s.value,
286+
s.unit,
287+
metadata,
288+
)
289+
)
290+
final_samples.append(
291+
sample.Sample(
292+
f"{metric_name}_per_node_pool",
293+
s.value / node_pools_to_add,
294+
s.unit,
295+
s.metadata,
296+
)
297+
)
298+
return final_samples
249299

250300

251301
def Run(benchmark_spec: bm_spec.BenchmarkSpec) -> List[sample.Sample]:
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import unittest
2+
from unittest import mock
3+
4+
from absl.testing import flagsaver
5+
from perfkitbenchmarker import benchmark_spec
6+
from perfkitbenchmarker import container_service
7+
from perfkitbenchmarker.linux_benchmarks.provisioning_benchmarks import provision_node_pools_benchmark
8+
from tests import pkb_common_test_case
9+
10+
11+
class ProvisionNodePoolsBenchmarkTest(pkb_common_test_case.PkbCommonTestCase):
  """Tests for provision_node_pools_benchmark against a mocked cluster."""

  def setUp(self):
    super().setUp()
    # Autospec'd cluster: calls are validated against the real
    # KubernetesCluster interface without touching any cloud API.
    self.cluster = mock.create_autospec(
        container_service.KubernetesCluster, instance=True
    )
    self.cluster.CLOUD = 'GCP'

  def setUpWithXNodes(self, num_nodes: int):
    """Configures the mocked cluster to report `num_nodes` nodes and pools."""
    many_nodes = ['foo'] * num_nodes
    # Kubectl output is a whitespace-separated list of node names; the
    # (stdout, stderr, retcode)-style tuple mirrors RunKubectlCommand.
    self.enter_context(
        mock.patch.object(
            container_service,
            'RunKubectlCommand',
            return_value=(' '.join(many_nodes), None, None),
        )
    )
    self.cluster.GetNodeNames.return_value = many_nodes
    self.cluster.GetNodePoolNames.return_value = many_nodes

  def test_IndividualChecks(self):
    # 20 nodes reported = 10 pre-existing + 10 expected additions, so both
    # assertion helpers should pass without raising.
    self.setUpWithXNodes(20)
    provision_node_pools_benchmark._AssertNodes(self.cluster, 10, 10)
    provision_node_pools_benchmark._AssertNodePools(self.cluster, 10, 10)

  @flagsaver.flagsaver(provision_node_pools_init_batch=0)
  @flagsaver.flagsaver(provision_node_pools_test_batch=10)
  def test_FullRun(self):
    # Skip the init batch (size 0) and run only the 10-pool test batch.
    self.setUpWithXNodes(10)
    b_spec = mock.create_autospec(benchmark_spec.BenchmarkSpec, instance=True)
    b_spec.container_cluster = self.cluster
    samples = provision_node_pools_benchmark.Run(b_spec)
    metrics = [s.metric for s in samples]
    # Subset check: Run may emit additional samples beyond these.
    self.assertContainsSubset(
        [
            'test_batch_apply_time',
            'total_time',
            'total_time_per_node_pool',
            'test_batch_ready_time',
            'test_batch_provisioning_time',
        ],
        metrics,
    )
55+
56+
57+
# Allow running this test module directly.
if __name__ == '__main__':
  unittest.main()

0 commit comments

Comments
 (0)