Skip to content

Commit 529a16a

Browse files
committed
Migrate stressng and uperf workloads to native Kubernetes (no snafu/wrapper/operator)
1 parent ae9a37c commit 529a16a

File tree

257 files changed

+24388
-2243
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

257 files changed

+24388
-2243
lines changed

README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ Choose one from the following list:
7474

7575
Not mandatory:
7676

77-
**auto:** NAMESPACE=benchmark-operator [ The default namespace is benchmark-operator ]
77+
**auto:** NAMESPACE=benchmark-runner [ The default namespace is benchmark-runner ]
7878

7979
**auto:** ODF_PVC=True [ True=ODF PVC storage, False=Ephemeral storage, default True ]
8080

@@ -84,8 +84,6 @@ Not mandatory:
8484

8585
**auto:** RUNNER_PATH=/tmp [ The default work space is /tmp ]
8686

87-
**optional:** PIN_NODE_BENCHMARK_OPERATOR=$PIN_NODE_BENCHMARK_OPERATOR [node selector for benchmark operator pod]
88-
8987
**optional:** PIN_NODE1=$PIN_NODE1 [node1 selector for running the workload]
9088

9189
**optional:** PIN_NODE2=$PIN_NODE2 [node2 selector for running the workload, i.e. uperf server and client, hammerdb database and workload]
@@ -111,17 +109,16 @@ Not mandatory:
111109
For example:
112110

113111
```sh
114-
podman run --rm -e WORKLOAD="hammerdb_pod_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE_BENCHMARK_OPERATOR="node_name-0" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
112+
podman run --rm -e WORKLOAD="hammerdb_pod_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
115113
```
116114
or
117115
```sh
118-
docker run --rm -e WORKLOAD="hammerdb_vm_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE_BENCHMARK_OPERATOR="node_name-0" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
116+
docker run --rm -e WORKLOAD="hammerdb_vm_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
119117
```
120118

121119
SAVE RUN ARTIFACTS LOCAL:
122120
1. add `-e SAVE_ARTIFACTS_LOCAL='True'` or `--save-artifacts-local=true`
123121
2. add `-v /tmp/benchmark-runner-run-artifacts:/tmp/benchmark-runner-run-artifacts`
124-
3. git clone -b v1.0.3 https://github.com/cloud-bulldozer/benchmark-operator /tmp/benchmark-operator
125122

126123
### Run vdbench workload in Pod using OpenShift
127124
![](media/benchmark-runner-demo.gif)

benchmark_runner/common/oc/oc.py

Lines changed: 111 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -802,18 +802,27 @@ def collect_events(self):
802802

803803
@typechecked
804804
@logger_time_stamp
805-
def get_pod(self, label: str, database: str = '', namespace: str = environment_variables.environment_variables_dict['namespace']):
805+
def get_pod(self, label: str = '', database: str = '', namespace: str = environment_variables.environment_variables_dict['namespace'], label_selector: str = ''):
806806
"""
807-
This method gets pods according to label
808-
:param label:
807+
This method gets pod name by name pattern or label selector
808+
:param label: pod name pattern (grep match)
809809
:param database:
810810
:param namespace:
811-
:return:
811+
:param label_selector: Kubernetes label selector (e.g. 'app=stressng_workload-<uuid>')
812+
:return: pod name
812813
"""
813814
if database:
814815
return self.run(
815816
f"{self._cli} get pods -n '{database}-db'" + " --no-headers | awk '{ print $1; }' | grep " + database,
816817
is_check=True).rstrip().decode('ascii')
818+
elif label_selector:
819+
namespace_opt = f'-n {namespace}' if namespace else ''
820+
result = self.run(
821+
f"{self._cli} get pods {namespace_opt} -l '{label_selector}' -o jsonpath='{{.items[0].metadata.name}}'",
822+
is_check=True)
823+
if isinstance(result, bytes):
824+
return result.decode('utf-8').strip().strip("'")
825+
return str(result).strip().strip("'") if result else ''
817826
else:
818827
namespace = f'-n {namespace}' if namespace else ''
819828
return self.run(f"{self._cli} get pods {namespace} --no-headers | awk '{{ print $1; }}' | grep -w '{label}'", is_check=True).rstrip().decode('ascii')
@@ -862,26 +871,27 @@ def get_pods(self):
862871

863872
@typechecked
864873
@logger_time_stamp
865-
def wait_for_pod_create(self, pod_name: str,
874+
def wait_for_pod_create(self, pod_name: str = '', label: str = '',
866875
namespace: str = environment_variables.environment_variables_dict['namespace'],
867876
timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
868877
"""
869-
This method waits till pod name is creating or throw exception after timeout
878+
This method waits till pod is created or throws exception after timeout.
879+
Can match by pod_name or label selector (for Job pods with random suffixes).
880+
:param pod_name: Pod name to match
881+
:param label: Label selector to match (e.g. 'app=stressng_workload-<uuid>')
870882
:param namespace:
871-
:param pod_name:
872883
:param timeout:
873-
:return: True if getting pod name or raise PodNameError
884+
:return: True if pod found or raise PodNotCreateTimeout
874885
"""
875886
current_wait_time = 0
876887
while timeout <= 0 or current_wait_time <= timeout:
877-
if self.pod_exists(pod_name=pod_name, namespace=namespace):
878-
self.describe_pod(pod_name=pod_name, namespace=namespace)
888+
if label and self.pod_label_exists(label_name=label, namespace=namespace):
889+
return True
890+
elif pod_name and self.pod_exists(pod_name=pod_name, namespace=namespace):
879891
return True
880-
# sleep for x seconds
881892
time.sleep(OC.SLEEP_TIME)
882893
current_wait_time += OC.SLEEP_TIME
883-
self.describe_pod(pod_name=pod_name, namespace=namespace)
884-
raise PodNotCreateTimeout(pod_name)
894+
raise PodNotCreateTimeout(pod_name or label)
885895

886896
@typechecked
887897
@logger_time_stamp
@@ -1163,13 +1173,23 @@ def wait_for_pod_completed(self, label: str, workload: str = '', label_uuid: boo
11631173
f"{self._cli} {namespace} wait --for=condition=failed -l {label}-{self.__get_short_uuid(workload=workload)} jobs --timeout={OC.SLEEP_TIME}s")
11641174
if 'met' in result:
11651175
return False
1166-
if not job:
1176+
elif job:
1177+
# Handle job=True with label_uuid=False (direct pod workloads)
1178+
result = self.run(
1179+
f"{self._cli} {namespace} wait --for=condition=complete -l {label} jobs --timeout={OC.SHORT_TIMEOUT}s")
1180+
if 'met' in result:
1181+
return True
1182+
result = self.run(
1183+
f"{self._cli} {namespace} wait --for=condition=failed -l {label} jobs --timeout={OC.SLEEP_TIME}s")
1184+
if 'met' in result:
1185+
return False
1186+
elif not job:
11671187
result = self.run(f"{self._cli} get pod -l {label}" + " -n benchmark-runner --no-headers | awk '{ print $3; }'")
11681188
if 'Completed' in result:
11691189
return True
1170-
# sleep for x seconds
1171-
time.sleep(OC.SLEEP_TIME)
1172-
current_wait_time += OC.SLEEP_TIME
1190+
# sleep for x seconds
1191+
time.sleep(OC.SLEEP_TIME)
1192+
current_wait_time += OC.SLEEP_TIME
11731193
except Exception as err:
11741194
raise PodNotCompletedTimeout(workload=workload)
11751195

@@ -1245,6 +1265,55 @@ def get_vm(self, label: str = '', namespace: str = environment_variables.environ
12451265
else:
12461266
return self.run(f'{self._cli} get vmi', is_check=True)
12471267

1268+
def _get_pod_field(self, field: str, label: str = '', pod_name: str = '', namespace: str = '') -> str:
1269+
"""
1270+
Get a pod field via jsonpath, by label selector or pod name
1271+
"""
1272+
namespace = namespace or environment_variables.environment_variables_dict.get('namespace', '')
1273+
try:
1274+
if label:
1275+
result = self.run(
1276+
cmd=f"{self._cli} get pods -n {namespace} -l {label} -o jsonpath='{{.items[0].{field}}}'")
1277+
else:
1278+
result = self.run(
1279+
cmd=f"{self._cli} get pod -n {namespace} {pod_name} -o jsonpath='{{.{field}}}'")
1280+
return result.strip().strip(b"'").decode('ascii') if isinstance(result, bytes) else str(result).strip().strip("'")
1281+
except Exception:
1282+
return ''
1283+
1284+
def get_pod_ip(self, label: str = '', pod_name: str = '', namespace: str = '') -> str:
    """Return the pod's IP address ('status.podIP'), or '' when unavailable."""
    ip_field = 'status.podIP'
    return self._get_pod_field(ip_field, label=label, pod_name=pod_name, namespace=namespace)
1286+
1287+
def get_pod_node(self, label: str = '', pod_name: str = '', namespace: str = '') -> str:
    """Return the node the pod is scheduled on ('spec.nodeName'), or '' when unavailable."""
    node_field = 'spec.nodeName'
    return self._get_pod_field(node_field, label=label, pod_name=pod_name, namespace=namespace)
1289+
1290+
def get_vmi_ip(self, namespace: str, vm_name: str, retries: int = 30, sleep_seconds: int = 2) -> str:
    """
    Get the IP address of a VirtualMachineInstance, polling until it is assigned.

    :param namespace: namespace of the VMI
    :param vm_name: VMI name
    :param retries: maximum number of polling attempts
    :param sleep_seconds: delay between attempts, in seconds (new, defaults to the previous fixed 2s)
    :return: the VMI IP address, or '' when none is reported within the retry budget
    """
    for attempt in range(retries):
        try:
            result = self.run(
                cmd=f"{self._cli} get vmi -n {namespace} {vm_name} -o jsonpath='{{.status.interfaces[0].ipAddress}}'")
            ip = result.strip().strip(b"'").decode('ascii') if isinstance(result, bytes) else str(result).strip().strip("'")
            # jsonpath prints '<none>' while the interface has no address yet
            if ip and ip != '<none>':
                return ip
        except Exception:
            # transient errors (e.g. VMI not created yet) are expected while polling
            pass
        # fix: only sleep between attempts, not after the final failed one
        if attempt < retries - 1:
            time.sleep(sleep_seconds)
    return ''
1305+
1306+
def get_cluster_name(self) -> str:
    """
    Return the cluster's infrastructure name (cluster ID), or '' on failure.

    Reads '.status.infrastructureName' from the 'infrastructure' cluster
    resource; best-effort, so any error yields an empty string.
    """
    try:
        output = self.run(cmd=f"{self._cli} get infrastructure cluster -o jsonpath='{{.status.infrastructureName}}'")
        if isinstance(output, bytes):
            return output.strip().strip(b"'").decode('ascii')
        return str(output).strip().strip("'")
    except Exception:
        # e.g. the 'infrastructure' API does not exist on this cluster
        return ''
1315+
1316+
12481317
@logger_time_stamp
12491318
def __verify_vm_log_complete(self, vm_name: str, timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
12501319
"""
@@ -1452,19 +1521,26 @@ def create_vm_sync(self, yaml: str, vm_name: str,
14521521

14531522
@typechecked
14541523
@logger_time_stamp
1455-
def delete_vm_sync(self, yaml: str, vm_name: str,
1524+
def delete_vm_sync(self, yaml: str = '', vm_name: str = '',
14561525
namespace: str = environment_variables.environment_variables_dict['namespace'],
14571526
timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
14581527
"""
1459-
This method deletes specified VM synchronously; return False if it does not exist
1528+
This method deletes specified VM synchronously; return False if it does not exist.
1529+
Can delete by YAML file or by VM name directly.
1530+
:param yaml: YAML file to delete (deletes all resources in YAML)
1531+
:param vm_name: VM name to delete (used when no YAML, or to delete specific VM)
14601532
:param namespace:
14611533
:param timeout:
1462-
:param vm_name:
1463-
:param yaml:
14641534
:return: return False if vm does not exist
14651535
"""
14661536
if self.vm_exists(vm_name=vm_name, namespace=namespace):
1467-
self.delete_async(yaml)
1537+
if yaml:
1538+
self.delete_async(yaml)
1539+
else:
1540+
try:
1541+
self.run(f"{self._cli} delete vm {vm_name} -n {namespace} --ignore-not-found")
1542+
except Exception:
1543+
pass
14681544
return self.wait_for_vm_delete(vm_name=vm_name, namespace=namespace, timeout=timeout)
14691545
else:
14701546
return False
@@ -1492,9 +1568,19 @@ def wait_for_vm_completed(self, workload: str = '', vm_name: str = '',
14921568
current_wait_time = 0
14931569
namespace = f'-n {namespace}' if namespace else ''
14941570
while timeout <= 0 or current_wait_time <= timeout:
1495-
if self.run(
1496-
f"{self._cli} {namespace} get benchmark {workload} -o jsonpath={{.status.complete}}") == 'true':
1497-
return True
1571+
# Check VMI phase for direct VM workloads
1572+
if vm_name:
1573+
vmi_phase = self.run(
1574+
f"{self._cli} {namespace} get vmi {vm_name} -o jsonpath={{.status.phase}}")
1575+
if vmi_phase == 'Succeeded':
1576+
return True
1577+
elif vmi_phase == 'Failed':
1578+
return False
1579+
else:
1580+
# Fallback to benchmark CR for operator-based workloads
1581+
if self.run(
1582+
f"{self._cli} {namespace} get benchmark {workload} -o jsonpath={{.status.complete}}") == 'true':
1583+
return True
14981584
# sleep for x seconds
14991585
time.sleep(OC.SLEEP_TIME)
15001586
current_wait_time += OC.SLEEP_TIME
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# ConfigMap + Job pair that runs the stress-ng workload natively on Kubernetes
# (replacing the benchmark-operator CR). All {{ ... }} / {%- ... %} markers are
# Jinja2 and are rendered by benchmark-runner before the manifest is applied.
apiVersion: v1
kind: ConfigMap
metadata:
  name: stressng-workload-{{ trunc_uuid }}
  namespace: {{ namespace }}
data:
  # stress-ng job file, mounted into the workload pod at /workload/jobfile.
  # NOTE(review): cpu-load / cpu-method render empty if cpu_stressors is defined
  # but cpu_percentage / cpu_method are not — confirm these are always set together.
  jobfile: |
    run {{ runtype }}
    verbose
    metrics-brief
    timeout {{ stressng_timeout }}

    # cpu stressor
    {%- if cpu_stressors is defined %}
    cpu {{ cpu_stressors }}
    cpu-load {{ cpu_percentage }}
    cpu-method {{ cpu_method }}
    {%- endif %}

    # vm stressor
    {%- if vm_stressors is defined %}
    vm {{ vm_stressors }}
    vm-bytes {{ vm_bytes }}
    vm-keep
    vm-populate
    {%- endif %}

    # memcpy stressor
    {%- if mem_stressors is defined %}
    memcpy {{ mem_stressors }}
    {%- endif %}
---
apiVersion: batch/v1
kind: Job
metadata:
  name: stressng-{{ kind }}-{{ trunc_uuid }}
  namespace: {{ namespace }}
spec:
  parallelism: {{ instances }}
  backoffLimit: 0  # fail fast: a failed benchmark pod is never retried
  activeDeadlineSeconds: {{ job_timeout|default(3600) }}
  template:
    metadata:
      labels:
        # labels used by benchmark-runner to locate/wait on the pod
        app: stressng_workload-{{ trunc_uuid }}
        type: stressng-bench-workload-{{ trunc_uuid }}
        benchmark-uuid: {{ uuid }}
        benchmark-runner-workload: stressng
    spec:
      {%- if pin == 'true' or pin == true %}
      # pin the workload to a specific node when requested
      nodeSelector:
        kubernetes.io/hostname: '{{ pin_node }}'
      {%- endif %}
      {%- if kind == 'kata' %}
      runtimeClassName: kata
      {%- endif %}
      containers:
        - name: stressng
          {%- if resources == 'true' or resources == true %}
          resources:
            requests:
              cpu: {{ requests_cpu }}
              memory: {{ requests_memory }}
            limits:
              cpu: {{ limits_cpu }}
              memory: {{ limits_memory }}
          {%- endif %}
          image: {{ image | default('quay.io/benchmark-runner/stressng:latest') }}
          imagePullPolicy: Always
          # metadata env vars are echoed into the result JSON by the in-pod parser below
          env:
            - name: uuid
              value: "{{ uuid }}"
            - name: test_user
              value: "{{ test_user | default('user') }}"
            - name: clustername
              value: "{{ clustername | default('') }}"
            - name: runtype
              value: "{{ runtype }}"
            - name: timeout
              value: "{{ stressng_timeout }}"
            {%- if cpu_stressors is defined %}
            - name: cpu_stressors
              value: "{{ cpu_stressors }}"
            {%- endif %}
            {%- if cpu_percentage is defined %}
            - name: cpu_percentage
              value: "{{ cpu_percentage }}"
            {%- endif %}
            {%- if cpu_method is defined %}
            - name: cpu_method
              value: "{{ cpu_method }}"
            {%- endif %}
            {%- if vm_stressors is defined %}
            - name: vm_stressors
              value: "{{ vm_stressors }}"
            {%- endif %}
            {%- if vm_bytes is defined %}
            - name: vm_bytes
              value: "{{ vm_bytes }}"
            {%- endif %}
            {%- if mem_stressors is defined %}
            - name: mem_stressors
              value: "{{ mem_stressors }}"
            {%- endif %}
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              cd /tmp
              stress-ng --job /workload/jobfile --log-file /tmp/stressng.log -Y /tmp/stressng.yml || exit 1
              # Parse results into JSON (visible in pod logs, uploaded by benchmark-runner)
              if [ -f /tmp/stressng.yml ] && python3 -c 'import yaml' 2>/dev/null; then
              python3 << 'PYEOF'
              import yaml, json, os
              from datetime import datetime, timezone
              try:
                  with open("/tmp/stressng.yml") as f:
                      d = yaml.safe_load(f)
              except Exception:
                  d = {}
              metrics = d.get("metrics", [])
              doc = {
                  "workload": "stressng",
                  "kind": "pod",
                  "runtype": os.environ.get("runtype", ""),
                  "timeout": int(os.environ.get("timeout", 0) or 0),
                  "vm_stressors": os.environ.get("vm_stressors", ""),
                  "vm_bytes": os.environ.get("vm_bytes", ""),
                  "mem_stressors": os.environ.get("mem_stressors", ""),
                  "cpu_method": os.environ.get("cpu_method", ""),
              }
              for m in metrics:
                  s = m.get("stressor", "")
                  b = m.get("bogo-ops", 0)
                  doc[s] = b
                  if s == "cpu": doc["cpu_bogomips"] = b
                  elif s == "vm": doc["vm_bogomips"] = b
              bogo_total = sum(doc.get(s, 0) for s in ["cpu", "vm", "mem", "memcpy"])
              doc["bogo_ops"] = bogo_total
              print(json.dumps(doc))
              PYEOF
              fi
          volumeMounts:
            - name: stressng-workload-volume
              mountPath: "/workload"
              readOnly: false
      volumes:
        - name: stressng-workload-volume
          configMap:
            name: stressng-workload-{{ trunc_uuid }}
            defaultMode: 0660
      restartPolicy: Never

0 commit comments

Comments
 (0)