Skip to content

Commit 529a16a

Browse files
committed
Migrate stressng and uperf workloads to native Kubernetes (no snafu/wrapper/operator)
1 parent ae9a37c commit 529a16a

File tree

257 files changed

+24388
-2243
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

257 files changed

+24388
-2243
lines changed

README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ Choose one from the following list:
7474

7575
Not mandatory:
7676

77-
**auto:** NAMESPACE=benchmark-operator [ The default namespace is benchmark-operator ]
77+
**auto:** NAMESPACE=benchmark-runner [ The default namespace is benchmark-runner ]
7878

7979
**auto:** ODF_PVC=True [ True=ODF PVC storage, False=Ephemeral storage, default True ]
8080

@@ -84,8 +84,6 @@ Not mandatory:
8484

8585
**auto:** RUNNER_PATH=/tmp [ The default work space is /tmp ]
8686

87-
**optional:** PIN_NODE_BENCHMARK_OPERATOR=$PIN_NODE_BENCHMARK_OPERATOR [node selector for benchmark operator pod]
88-
8987
**optional:** PIN_NODE1=$PIN_NODE1 [node1 selector for running the workload]
9088

9189
**optional:** PIN_NODE2=$PIN_NODE2 [node2 selector for running the workload, i.e. uperf server and client, hammerdb database and workload]
@@ -111,17 +109,16 @@ Not mandatory:
111109
For example:
112110

113111
```sh
114-
podman run --rm -e WORKLOAD="hammerdb_pod_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE_BENCHMARK_OPERATOR="node_name-0" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
112+
podman run --rm -e WORKLOAD="hammerdb_pod_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
115113
```
116114
or
117115
```sh
118-
docker run --rm -e WORKLOAD="hammerdb_vm_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE_BENCHMARK_OPERATOR="node_name-0" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
116+
docker run --rm -e WORKLOAD="hammerdb_vm_mariadb" -e KUBEADMIN_PASSWORD="1234" -e PIN_NODE1="node_name-1" -e PIN_NODE2="node_name-2" -e log_level=INFO -v /root/.kube/config:/root/.kube/config --privileged quay.io/benchmark-runner/benchmark-runner:latest
119117
```
120118

121119
SAVE RUN ARTIFACTS LOCAL:
122120
1. add `-e SAVE_ARTIFACTS_LOCAL='True'` or `--save-artifacts-local=true`
123121
2. add `-v /tmp/benchmark-runner-run-artifacts:/tmp/benchmark-runner-run-artifacts`
124-
3. git clone -b v1.0.3 https://github.com/cloud-bulldozer/benchmark-operator /tmp/benchmark-operator
125122

126123
### Run vdbench workload in Pod using OpenShift
127124
![](media/benchmark-runner-demo.gif)

benchmark_runner/common/oc/oc.py

Lines changed: 111 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -802,18 +802,27 @@ def collect_events(self):
802802

803803
@typechecked
804804
@logger_time_stamp
805-
def get_pod(self, label: str, database: str = '', namespace: str = environment_variables.environment_variables_dict['namespace']):
805+
def get_pod(self, label: str = '', database: str = '', namespace: str = environment_variables.environment_variables_dict['namespace'], label_selector: str = ''):
806806
"""
807-
This method gets pods according to label
808-
:param label:
807+
This method gets pod name by name pattern or label selector
808+
:param label: pod name pattern (grep match)
809809
:param database:
810810
:param namespace:
811-
:return:
811+
:param label_selector: Kubernetes label selector (e.g. 'app=stressng_workload-<uuid>')
812+
:return: pod name
812813
"""
813814
if database:
814815
return self.run(
815816
f"{self._cli} get pods -n '{database}-db'" + " --no-headers | awk '{ print $1; }' | grep " + database,
816817
is_check=True).rstrip().decode('ascii')
818+
elif label_selector:
819+
namespace_opt = f'-n {namespace}' if namespace else ''
820+
result = self.run(
821+
f"{self._cli} get pods {namespace_opt} -l '{label_selector}' -o jsonpath='{{.items[0].metadata.name}}'",
822+
is_check=True)
823+
if isinstance(result, bytes):
824+
return result.decode('utf-8').strip().strip("'")
825+
return str(result).strip().strip("'") if result else ''
817826
else:
818827
namespace = f'-n {namespace}' if namespace else ''
819828
return self.run(f"{self._cli} get pods {namespace} --no-headers | awk '{{ print $1; }}' | grep -w '{label}'", is_check=True).rstrip().decode('ascii')
@@ -862,26 +871,27 @@ def get_pods(self):
862871

863872
@typechecked
864873
@logger_time_stamp
865-
def wait_for_pod_create(self, pod_name: str,
874+
def wait_for_pod_create(self, pod_name: str = '', label: str = '',
866875
namespace: str = environment_variables.environment_variables_dict['namespace'],
867876
timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
868877
"""
869-
This method waits till pod name is creating or throw exception after timeout
878+
This method waits till pod is created or throws exception after timeout.
879+
Can match by pod_name or label selector (for Job pods with random suffixes).
880+
:param pod_name: Pod name to match
881+
:param label: Label selector to match (e.g. 'app=stressng_workload-<uuid>')
870882
:param namespace:
871-
:param pod_name:
872883
:param timeout:
873-
:return: True if getting pod name or raise PodNameError
884+
:return: True if pod found or raise PodNotCreateTimeout
874885
"""
875886
current_wait_time = 0
876887
while timeout <= 0 or current_wait_time <= timeout:
877-
if self.pod_exists(pod_name=pod_name, namespace=namespace):
878-
self.describe_pod(pod_name=pod_name, namespace=namespace)
888+
if label and self.pod_label_exists(label_name=label, namespace=namespace):
889+
return True
890+
elif pod_name and self.pod_exists(pod_name=pod_name, namespace=namespace):
879891
return True
880-
# sleep for x seconds
881892
time.sleep(OC.SLEEP_TIME)
882893
current_wait_time += OC.SLEEP_TIME
883-
self.describe_pod(pod_name=pod_name, namespace=namespace)
884-
raise PodNotCreateTimeout(pod_name)
894+
raise PodNotCreateTimeout(pod_name or label)
885895

886896
@typechecked
887897
@logger_time_stamp
@@ -1163,13 +1173,23 @@ def wait_for_pod_completed(self, label: str, workload: str = '', label_uuid: boo
11631173
f"{self._cli} {namespace} wait --for=condition=failed -l {label}-{self.__get_short_uuid(workload=workload)} jobs --timeout={OC.SLEEP_TIME}s")
11641174
if 'met' in result:
11651175
return False
1166-
if not job:
1176+
elif job:
1177+
# Handle job=True with label_uuid=False (direct pod workloads)
1178+
result = self.run(
1179+
f"{self._cli} {namespace} wait --for=condition=complete -l {label} jobs --timeout={OC.SHORT_TIMEOUT}s")
1180+
if 'met' in result:
1181+
return True
1182+
result = self.run(
1183+
f"{self._cli} {namespace} wait --for=condition=failed -l {label} jobs --timeout={OC.SLEEP_TIME}s")
1184+
if 'met' in result:
1185+
return False
1186+
elif not job:
11671187
result = self.run(f"{self._cli} get pod -l {label}" + " -n benchmark-runner --no-headers | awk '{ print $3; }'")
11681188
if 'Completed' in result:
11691189
return True
1170-
# sleep for x seconds
1171-
time.sleep(OC.SLEEP_TIME)
1172-
current_wait_time += OC.SLEEP_TIME
1190+
# sleep for x seconds
1191+
time.sleep(OC.SLEEP_TIME)
1192+
current_wait_time += OC.SLEEP_TIME
11731193
except Exception as err:
11741194
raise PodNotCompletedTimeout(workload=workload)
11751195

@@ -1245,6 +1265,55 @@ def get_vm(self, label: str = '', namespace: str = environment_variables.environ
12451265
else:
12461266
return self.run(f'{self._cli} get vmi', is_check=True)
12471267

1268+
def _get_pod_field(self, field: str, label: str = '', pod_name: str = '', namespace: str = '') -> str:
1269+
"""
1270+
Get a pod field via jsonpath, by label selector or pod name
1271+
"""
1272+
namespace = namespace or environment_variables.environment_variables_dict.get('namespace', '')
1273+
try:
1274+
if label:
1275+
result = self.run(
1276+
cmd=f"{self._cli} get pods -n {namespace} -l {label} -o jsonpath='{{.items[0].{field}}}'")
1277+
else:
1278+
result = self.run(
1279+
cmd=f"{self._cli} get pod -n {namespace} {pod_name} -o jsonpath='{{.{field}}}'")
1280+
return result.strip().strip(b"'").decode('ascii') if isinstance(result, bytes) else str(result).strip().strip("'")
1281+
except Exception:
1282+
return ''
1283+
1284+
def get_pod_ip(self, label: str = '', pod_name: str = '', namespace: str = '') -> str:
    """Return the pod's IP address ('status.podIP'), or '' when unavailable."""
    ip_field = 'status.podIP'
    return self._get_pod_field(ip_field, label=label, pod_name=pod_name, namespace=namespace)
1286+
1287+
def get_pod_node(self, label: str = '', pod_name: str = '', namespace: str = '') -> str:
    """Return the node the pod is scheduled on ('spec.nodeName'), or '' when unavailable."""
    node_field = 'spec.nodeName'
    return self._get_pod_field(node_field, label=label, pod_name=pod_name, namespace=namespace)
1289+
1290+
def get_vmi_ip(self, namespace: str, vm_name: str, retries: int = 30, sleep_seconds: int = 2) -> str:
    """
    Get the IP address of a VirtualMachineInstance, polling until it is assigned.

    :param namespace: namespace of the VMI
    :param vm_name: VMI name
    :param retries: maximum number of polling attempts
    :param sleep_seconds: delay between attempts, in seconds (new, defaults to the previous fixed 2s)
    :return: the VMI IP address, or '' when none is reported within the retry budget
    """
    for attempt in range(retries):
        try:
            result = self.run(
                cmd=f"{self._cli} get vmi -n {namespace} {vm_name} -o jsonpath='{{.status.interfaces[0].ipAddress}}'")
            ip = result.strip().strip(b"'").decode('ascii') if isinstance(result, bytes) else str(result).strip().strip("'")
            # jsonpath prints '<none>' while the interface has no address yet
            if ip and ip != '<none>':
                return ip
        except Exception:
            # transient errors (e.g. VMI not created yet) are expected while polling
            pass
        # fix: only sleep between attempts, not after the final failed one
        if attempt < retries - 1:
            time.sleep(sleep_seconds)
    return ''
1305+
1306+
def get_cluster_name(self) -> str:
    """
    Return the cluster's infrastructure name (cluster ID), or '' on failure.

    Reads '.status.infrastructureName' from the 'infrastructure' cluster
    resource; best-effort, so any error yields an empty string.
    """
    try:
        output = self.run(cmd=f"{self._cli} get infrastructure cluster -o jsonpath='{{.status.infrastructureName}}'")
        if isinstance(output, bytes):
            return output.strip().strip(b"'").decode('ascii')
        return str(output).strip().strip("'")
    except Exception:
        # e.g. the 'infrastructure' API does not exist on this cluster
        return ''
1315+
1316+
12481317
@logger_time_stamp
12491318
def __verify_vm_log_complete(self, vm_name: str, timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
12501319
"""
@@ -1452,19 +1521,26 @@ def create_vm_sync(self, yaml: str, vm_name: str,
14521521

14531522
@typechecked
14541523
@logger_time_stamp
1455-
def delete_vm_sync(self, yaml: str, vm_name: str,
1524+
def delete_vm_sync(self, yaml: str = '', vm_name: str = '',
14561525
namespace: str = environment_variables.environment_variables_dict['namespace'],
14571526
timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
14581527
"""
1459-
This method deletes specified VM synchronously; return False if it does not exist
1528+
This method deletes specified VM synchronously; return False if it does not exist.
1529+
Can delete by YAML file or by VM name directly.
1530+
:param yaml: YAML file to delete (deletes all resources in YAML)
1531+
:param vm_name: VM name to delete (used when no YAML, or to delete specific VM)
14601532
:param namespace:
14611533
:param timeout:
1462-
:param vm_name:
1463-
:param yaml:
14641534
:return: return False if vm does not exist
14651535
"""
14661536
if self.vm_exists(vm_name=vm_name, namespace=namespace):
1467-
self.delete_async(yaml)
1537+
if yaml:
1538+
self.delete_async(yaml)
1539+
else:
1540+
try:
1541+
self.run(f"{self._cli} delete vm {vm_name} -n {namespace} --ignore-not-found")
1542+
except Exception:
1543+
pass
14681544
return self.wait_for_vm_delete(vm_name=vm_name, namespace=namespace, timeout=timeout)
14691545
else:
14701546
return False
@@ -1492,9 +1568,19 @@ def wait_for_vm_completed(self, workload: str = '', vm_name: str = '',
14921568
current_wait_time = 0
14931569
namespace = f'-n {namespace}' if namespace else ''
14941570
while timeout <= 0 or current_wait_time <= timeout:
1495-
if self.run(
1496-
f"{self._cli} {namespace} get benchmark {workload} -o jsonpath={{.status.complete}}") == 'true':
1497-
return True
1571+
# Check VMI phase for direct VM workloads
1572+
if vm_name:
1573+
vmi_phase = self.run(
1574+
f"{self._cli} {namespace} get vmi {vm_name} -o jsonpath={{.status.phase}}")
1575+
if vmi_phase == 'Succeeded':
1576+
return True
1577+
elif vmi_phase == 'Failed':
1578+
return False
1579+
else:
1580+
# Fallback to benchmark CR for operator-based workloads
1581+
if self.run(
1582+
f"{self._cli} {namespace} get benchmark {workload} -o jsonpath={{.status.complete}}") == 'true':
1583+
return True
14981584
# sleep for x seconds
14991585
time.sleep(OC.SLEEP_TIME)
15001586
current_wait_time += OC.SLEEP_TIME
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# ConfigMap + Job pair that runs the stress-ng workload natively on Kubernetes
# (replacing the benchmark-operator CR). All {{ ... }} / {%- ... %} markers are
# Jinja2 and are rendered by benchmark-runner before the manifest is applied.
apiVersion: v1
kind: ConfigMap
metadata:
  name: stressng-workload-{{ trunc_uuid }}
  namespace: {{ namespace }}
data:
  # stress-ng job file, mounted into the workload pod at /workload/jobfile.
  # NOTE(review): cpu-load / cpu-method render empty if cpu_stressors is defined
  # but cpu_percentage / cpu_method are not — confirm these are always set together.
  jobfile: |
    run {{ runtype }}
    verbose
    metrics-brief
    timeout {{ stressng_timeout }}

    # cpu stressor
    {%- if cpu_stressors is defined %}
    cpu {{ cpu_stressors }}
    cpu-load {{ cpu_percentage }}
    cpu-method {{ cpu_method }}
    {%- endif %}

    # vm stressor
    {%- if vm_stressors is defined %}
    vm {{ vm_stressors }}
    vm-bytes {{ vm_bytes }}
    vm-keep
    vm-populate
    {%- endif %}

    # memcpy stressor
    {%- if mem_stressors is defined %}
    memcpy {{ mem_stressors }}
    {%- endif %}
---
apiVersion: batch/v1
kind: Job
metadata:
  name: stressng-{{ kind }}-{{ trunc_uuid }}
  namespace: {{ namespace }}
spec:
  parallelism: {{ instances }}
  backoffLimit: 0  # fail fast: a failed benchmark pod is never retried
  activeDeadlineSeconds: {{ job_timeout|default(3600) }}
  template:
    metadata:
      labels:
        # labels used by benchmark-runner to locate/wait on the pod
        app: stressng_workload-{{ trunc_uuid }}
        type: stressng-bench-workload-{{ trunc_uuid }}
        benchmark-uuid: {{ uuid }}
        benchmark-runner-workload: stressng
    spec:
      {%- if pin == 'true' or pin == true %}
      # pin the workload to a specific node when requested
      nodeSelector:
        kubernetes.io/hostname: '{{ pin_node }}'
      {%- endif %}
      {%- if kind == 'kata' %}
      runtimeClassName: kata
      {%- endif %}
      containers:
        - name: stressng
          {%- if resources == 'true' or resources == true %}
          resources:
            requests:
              cpu: {{ requests_cpu }}
              memory: {{ requests_memory }}
            limits:
              cpu: {{ limits_cpu }}
              memory: {{ limits_memory }}
          {%- endif %}
          image: {{ image | default('quay.io/benchmark-runner/stressng:latest') }}
          imagePullPolicy: Always
          # metadata env vars are echoed into the result JSON by the in-pod parser below
          env:
            - name: uuid
              value: "{{ uuid }}"
            - name: test_user
              value: "{{ test_user | default('user') }}"
            - name: clustername
              value: "{{ clustername | default('') }}"
            - name: runtype
              value: "{{ runtype }}"
            - name: timeout
              value: "{{ stressng_timeout }}"
            {%- if cpu_stressors is defined %}
            - name: cpu_stressors
              value: "{{ cpu_stressors }}"
            {%- endif %}
            {%- if cpu_percentage is defined %}
            - name: cpu_percentage
              value: "{{ cpu_percentage }}"
            {%- endif %}
            {%- if cpu_method is defined %}
            - name: cpu_method
              value: "{{ cpu_method }}"
            {%- endif %}
            {%- if vm_stressors is defined %}
            - name: vm_stressors
              value: "{{ vm_stressors }}"
            {%- endif %}
            {%- if vm_bytes is defined %}
            - name: vm_bytes
              value: "{{ vm_bytes }}"
            {%- endif %}
            {%- if mem_stressors is defined %}
            - name: mem_stressors
              value: "{{ mem_stressors }}"
            {%- endif %}
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              cd /tmp
              stress-ng --job /workload/jobfile --log-file /tmp/stressng.log -Y /tmp/stressng.yml || exit 1
              # Parse results into JSON (visible in pod logs, uploaded by benchmark-runner)
              if [ -f /tmp/stressng.yml ] && python3 -c 'import yaml' 2>/dev/null; then
              python3 << 'PYEOF'
              import yaml, json, os
              from datetime import datetime, timezone
              try:
                  with open("/tmp/stressng.yml") as f:
                      d = yaml.safe_load(f)
              except Exception:
                  d = {}
              metrics = d.get("metrics", [])
              doc = {
                  "workload": "stressng",
                  "kind": "pod",
                  "runtype": os.environ.get("runtype", ""),
                  "timeout": int(os.environ.get("timeout", 0) or 0),
                  "vm_stressors": os.environ.get("vm_stressors", ""),
                  "vm_bytes": os.environ.get("vm_bytes", ""),
                  "mem_stressors": os.environ.get("mem_stressors", ""),
                  "cpu_method": os.environ.get("cpu_method", ""),
              }
              for m in metrics:
                  s = m.get("stressor", "")
                  b = m.get("bogo-ops", 0)
                  doc[s] = b
                  if s == "cpu": doc["cpu_bogomips"] = b
                  elif s == "vm": doc["vm_bogomips"] = b
              bogo_total = sum(doc.get(s, 0) for s in ["cpu", "vm", "mem", "memcpy"])
              doc["bogo_ops"] = bogo_total
              print(json.dumps(doc))
              PYEOF
              fi
          volumeMounts:
            - name: stressng-workload-volume
              mountPath: "/workload"
              readOnly: false
      volumes:
        - name: stressng-workload-volume
          configMap:
            name: stressng-workload-{{ trunc_uuid }}
            defaultMode: 0660
      restartPolicy: Never

0 commit comments

Comments
 (0)