Skip to content

Commit 90f24f7

Browse files
p3rf Team and copybara-github
authored and committed
Add a rampup test for VPA
This test is similar to the HPA test (linux_benchmarks/kubernetes_hpa.py), except that it uses VPA. Due to VPA's longer reaction time, the test runs over a much longer period as well. PiperOrigin-RevId: 836241834
1 parent 923bd4e commit 90f24f7

File tree

10 files changed

+792
-28
lines changed

10 files changed

+792
-28
lines changed

perfkitbenchmarker/configs/container_spec.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,10 @@ def _GetOptionDecoderConstructions(cls):
483483
kubernetes_inference_server_spec.InferenceServerConfigDecoder,
484484
{'default': None, 'none_ok': True},
485485
),
486+
'enable_vpa': (
487+
option_decoders.BooleanDecoder,
488+
{'default': False},
489+
),
486490
})
487491
return result
488492

perfkitbenchmarker/container_service.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,7 @@ def __init__(self, cluster_spec: container_spec_lib.ContainerClusterSpec):
509509
self.services: dict[str, KubernetesContainerService] = {}
510510
self._extra_samples: list[sample.Sample] = []
511511
self.container_registry: BaseContainerRegistry | None = None
512+
self.enable_vpa: bool = cluster_spec.enable_vpa
512513

513514
@property
514515
def num_nodes(self) -> int:
@@ -1320,6 +1321,214 @@ def _Sample(count: int, state: str) -> Sample:
13201321
_Sample(unknown, 'unknown'),
13211322
]
13221323

1324+
@staticmethod
def _GetPodNamesForResource(
    resource_name: str, namespace: str = ''
) -> list[str]:
  """Gets the names of pods managed by a resource (e.g., deployment).

  The resource must have a .spec.selector.matchLabels defined, and non-empty.

  Args:
    resource_name: the resource type and name, e.g. 'deployment/my_deploy'.
    namespace: The namespace of the resource.

  Returns:
    The names of all pods matching the resource's label selector, or an
    empty list if the resource does not exist.

  Raises:
    ValueError: if the resource does not have a (non-empty)
      `.spec.selector.matchLabels` field or if the field cannot be parsed.
  """
  # NB: If this approach is insufficient, then we could convert this to use
  # ownerReferences instead. This would mean going over several hops (i.e.
  # deployment->replicaset->pods) so probably is not worth it unless the
  # current approach proves inadequate.

  # Step 1: read the label selector off the resource itself.
  stdout, stderr, retcode = RunKubectlCommand(
      [
          'get',
          resource_name,
          '-n',
          namespace,
          "-o=jsonpath='{.spec.selector.matchLabels}'",
      ],
      raise_on_failure=False,
  )
  if retcode != 0:
    # A resource that does not exist simply has no pods; any other
    # failure is surfaced to the caller.
    if 'NotFound' in stderr:
      return []
    raise errors.VmUtil.IssueCommandError(
        f'Failed to get selector for resource {resource_name}: {stderr}'
    )

  # The output can be empty if there are no labels (which violates the
  # pre-condition).
  raw_selector = stdout.strip("'")
  if not raw_selector:
    raise ValueError(
        'A resource without a .spec.selector.matchLabels was passed to'
        ' _GetPodNamesForResource.'
    )
  try:
    match_labels = json.loads(raw_selector)
  except json.JSONDecodeError as e:
    raise ValueError(
        f'Could not decode selector for resource {resource_name}:'
        f' {stdout}'
    ) from e
  if not match_labels:
    raise ValueError(
        'A resource without a (non-empty) .spec.selector.matchLabels was'
        ' passed to _GetPodNamesForResource.'
    )

  # Step 2: turn the matchLabels dict into a kubectl label selector
  # ("k1=v1,k2=v2") and list the pods that carry those labels.
  label_selector = ','.join(f'{k}={v}' for k, v in match_labels.items())
  pods_out, _, _ = RunKubectlCommand([
      'get',
      'pods',
      '-l',
      label_selector,
      '-n',
      namespace,
      '-o=jsonpath={.items[*].metadata.name}',
  ])
  return pods_out.split()
1402+
1403+
@staticmethod
def GetCPURequestSamples(
    resource_name: str, namespace: str = ''
) -> list[Sample]:
  """Returns the CPU requests for all pods within the specified resource.

  Args:
    resource_name: The deployment/statefulset/etc's name, e.g.
      'deployment/my_deployment'.
    namespace: The namespace of the resource. If omitted, the 'default'
      namespace will be used.

  Returns:
    A list of Samples, each representing the CPU request of a pod.
  """
  now = int(time.time())

  pod_names = KubernetesClusterCommands._GetPodNamesForResource(
      resource_name, namespace
  )
  samples = []

  for pod_name in pod_names:
    # Get CPU requests for each pod. For pods with multiple containers the
    # jsonpath output is a space-separated list, one quantity per container.
    cpu_request_stdout, _, _ = RunKubectlCommand(
        [
            'get',
            'pod',
            pod_name,
            '-n',
            namespace,
            '-o=jsonpath={.spec.containers[*].resources.requests.cpu}',
        ],
    )

    cpu_request_str = cpu_request_stdout.strip()
    if not cpu_request_str:
      # No container declares a CPU request; nothing to report for this pod.
      continue

    # Sum the per-container CPU quantities, converting each Kubernetes CPU
    # string (e.g., "100m", "1") to a float number of cores. Summing (rather
    # than parsing the whole string as one value) is required because a
    # multi-container pod yields e.g. "100m 200m", which float() rejects.
    cpu_request = 0.0
    for quantity in cpu_request_str.split():
      if quantity.endswith('m'):
        cpu_request += float(quantity[:-1]) / 1000
      else:
        cpu_request += float(quantity)

    samples.append(
        Sample(
            metric='kubernetes_cpu_request',
            value=cpu_request,
            unit='cores',
            metadata={
                'namespace': namespace,
                'resource_name': resource_name,
                'pod': pod_name,
            },
            timestamp=now,
        )
    )
  return samples
1461+
1462+
@staticmethod
def GetCPUUsageSamples(
    resource_name: str, namespace: str = ''
) -> list[Sample]:
  """Returns the CPU usage for all pods within the specified resource.

  Args:
    resource_name: The deployment/statefulset/etc's name, e.g.
      'deployment/my_deployment'.
    namespace: The namespace of the resource. If omitted, the 'default'
      namespace will be used.

  Returns:
    A list of Samples, each representing the CPU usage of a pod.
  """
  now = int(time.time())

  pods = KubernetesClusterCommands._GetPodNamesForResource(
      resource_name, namespace
  )
  samples = []

  for pod in pods:
    # `kubectl top pod <pod-name> --namespace <namespace> --containers`
    # prints one row per container, e.g.:
    #   POD       NAME   CPU(cores)   MEMORY(bytes)
    #   fib-xyz   fib    10m          20Mi
    stdout, stderr, retcode = RunKubectlCommand(
        ['top', 'pod', pod, '--namespace', namespace, '--containers'],
        raise_on_failure=False,
    )
    if retcode != 0:
      # Metrics may be temporarily unavailable (e.g., for a freshly
      # started pod); skip this pod rather than failing the collection.
      logging.warning('Could not get CPU usage for pod %s: %s', pod, stderr)
      continue

    # Drop the header row, then parse each container row.
    _, *rows = stdout.strip().split('\n')
    for row in rows:
      fields = row.split()
      if len(fields) < 4:
        raise errors.VmUtil.IssueCommandError(
            f'Unexpected output line from kubectl top: {row}'
        )

      # Convert the CPU quantity (e.g., "10m", "1") to cores.
      cpu_str = fields[2]
      if cpu_str.endswith('m'):
        cpu_usage = float(cpu_str[:-1]) / 1000
      else:
        cpu_usage = float(cpu_str)

      samples.append(
          Sample(
              metric='kubernetes_cpu_usage',
              value=cpu_usage,
              unit='cores',
              metadata={
                  'namespace': namespace,
                  'resource_name': resource_name,
                  'pod': pod,
                  'container': fields[1],  # Container name column.
              },
              timestamp=now,
          )
      )
  return samples
1531+
13231532
@staticmethod
13241533
def CreateConfigMap(name: str, from_file_dir: str):
13251534
"""Creates a Kubernetes ConfigMap.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Namespace isolating the VPA rampup workload.
apiVersion: v1
kind: Namespace
metadata:
  name: fib
---
# The fibonacci workload whose CPU requests the VPA will adjust.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fib
  namespace: fib
spec:
  replicas: 10
  selector:
    matchLabels:
      app: "fib"
  template:
    metadata:
      labels:
        app: "fib"
    spec:
      containers:
      - name: "fib"
        image: {{ fib_image }}
        imagePullPolicy: "Always"
        resources:
          # Deliberately tiny initial request/limit so the VPA has room to
          # ramp CPU up under load.
          requests:
            cpu: "5m"
            memory: "256Mi"
          limits:
            cpu: "10m"
            memory: "256Mi"
        ports:
        - containerPort: {{ port }}
          name: "web"
          protocol: "TCP"
      nodeSelector:
      {%- for node_selector in node_selectors %}
        {{node_selector}}
      {%- endfor %}
---
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: fib
  namespace: fib
spec:
  targetRef:
    apiVersion: "apps/v1"
    kind: "Deployment"
    name: "fib"
  updatePolicy:
    updateMode: "Auto"
  resourcePolicy:
    containerPolicies:
    - containerName: '*'
      # Memory is pinned at 256Mi (min == max); only CPU is allowed to
      # scale, up to 500m per container.
      maxAllowed:
        cpu: "500m"
        memory: "256Mi"
      minAllowed:
        memory: "256Mi"
61+
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""Locust file to simulate a "stepped" rampup of load."""
2+
3+
import locust
4+
5+
6+
class Rampup(locust.HttpUser):
  """A locust user that repeatedly requests /calculate."""

  # Send 1QPS (per user)
  wait_time = locust.constant_throughput(1)

  @locust.task
  def rampup(self):
    # Close the connection after each request (or else users won't get load
    # balanced to new pods.)
    self.client.get(
        "/calculate", headers={"Connection": "close"}, timeout=5.0
    )
17+
18+
19+
class StagesShape(locust.LoadTestShape):
  """Locust LoadTestShape to simulate a "stepped" rampup.

  NB: Because VPA is (deliberately) slow to react, we need to specify a lengthy
  rampup, measured in hours, rather than seconds.
  """

  # pyformat: disable
  # pylint: disable=bad-whitespace
  _stages = [
      {"endtime":  1*60*60, "users":   1},  # 1 rps for 1h
      {"endtime":  4*60*60, "users":  20},  # 20 rps for 3h
      {"endtime":  5*60*60, "users":  40},  # 40 rps for 1h
      {"endtime":  6*60*60, "users":  60},  # 60 rps for 1h
      {"endtime":  7*60*60, "users":  90},  # 90 rps for 1h
      {"endtime":  9*60*60, "users": 120},  # 120 rps for 2h
      {"endtime": 11*60*60, "users": 150},  # 150 rps for 2h
      {"endtime": 12*60*60, "users":   1},  # 1 rps for 1h
      # --------------
      # Total: 12h
  ]
  # pyformat: enable

  def tick(self):
    """Returns (user_count, spawn_rate) for the current stage, or None.

    Returning None after the final stage ends stops the test.
    """
    elapsed = self.get_run_time()
    for stage in self._stages:
      if elapsed < stage["endtime"]:
        # Spawn rate of 100/s brings new users up roughly immediately
        # (over ~1s) at each step boundary.
        return (stage["users"], 100)
    return None

0 commit comments

Comments
 (0)