Commit 51c57bd

[llm-d] Keep working (#907)
Summary by CodeRabbit:

* New Features
  * Added a GuideLLM Performance Analysis report with new throughput/latency charts and ranked key insights.
* Chores
  * CI presets and flavor matrix updated; added heterogeneous evaluation and adjusted rates/max-requests.
  * Test and toolbox workflows now run GuideLLM via CLI-style guidellm_args and no longer execute the multi-turn benchmark.
  * Visualization and parsing pipelines removed multi-turn outputs and now consume GuideLLM benchmark data.
* Documentation
  * Replaced separate benchmark params with a single guidellm_args CLI-argument list; multiturn doc entry removed.
2 parents 337a112 + 7dba32a commit 51c57bd

File tree

20 files changed: +624, -917 lines


docs/toolbox.generated/Llmd.run_guidellm_benchmark.rst

Lines changed: 2 additions & 18 deletions

@@ -56,23 +56,7 @@ Parameters

 * default value: ``900``


-``rate``
+``guidellm_args``

-* Request rate for the benchmark
-
-* default value: ``1``
-
-
-``max_seconds``
-
-* Maximum seconds to run benchmark
-
-* default value: ``30``
-
-
-``data``
-
-* Data configuration
-
-* default value: ``prompt_tokens=256,output_tokens=128``
+* List of additional guidellm arguments (e.g., ["--rate=10", "--max-seconds=30"])
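The single guidellm_args list replaces the old per-parameter knobs; each entry is passed straight through to the guidellm CLI. A minimal sketch of how such a list flattens into the final command line, with the fixed prefix/suffix mirroring the job template elsewhere in this commit (the endpoint URL is a hypothetical placeholder):

```python
# Sketch: how a guidellm_args list maps onto the guidellm CLI invocation.
# The surrounding fixed arguments mirror guidellm_benchmark_job.yaml.j2;
# the target endpoint is made up for illustration.
guidellm_args = ["--rate=10", "--max-seconds=30", "--data=prompt_tokens=256,output_tokens=128"]

cmd = [
    "guidellm", "benchmark", "run",
    "--target=http://llm-d.example.svc:8000/v1",  # hypothetical endpoint
    *guidellm_args,
    "--outputs=json",
]
print(" ".join(cmd))
```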

docs/toolbox.generated/index.rst

Lines changed: 0 additions & 1 deletion

@@ -191,7 +191,6 @@ Toolbox Documentation

 * :doc:`deploy_gateway <Llmd.deploy_gateway>` Deploys a GatewayClass and Gateway object
 * :doc:`deploy_llm_inference_service <Llmd.deploy_llm_inference_service>` Deploys an LLM InferenceService from a YAML file
 * :doc:`run_guidellm_benchmark <Llmd.run_guidellm_benchmark>` Runs a Guidellm benchmark job against the LLM inference service
-* :doc:`run_multiturn_benchmark <Llmd.run_multiturn_benchmark>` Runs a multi-turn benchmark job against the LLM inference service

 ``local_ci``
 ************

projects/llm-d/testing/config.yaml

Lines changed: 19 additions & 14 deletions

@@ -30,24 +30,33 @@ ci_presets:
         operator: NotIn
         values:
         - gf48e48
+        - gf4334a
     prepare.preload.extra_images:
       vllm-cuda-rhel9: registry.redhat.io/rhaiis/vllm-cuda-rhel9@sha256:094db84a1da5e8a575d0c9eade114fa30f4a2061064a338e3e032f3578f8082a
       llm-d-inference-scheduler: ghcr.io/opendatahub-io/rhaii-on-xks/llm-d-inference-scheduler:e6b5db0@sha256:43e8b8edc158f31535c8b23d77629f8cde111cc762a8f4ee5f2f884470566211
       guidellm: ghcr.io/vllm-project/guidellm:v0.5.4

   multi-flavor:
-    tests.llmd.flavors: [simple-tp8-x2, intelligentrouting-x2-tp8, simple, simple-x2, simple-tp8, intelligentrouting-tp8]
+    tests.llmd.flavors: [simple-tp4, simple-tp2-x4, intelligentrouting-tp2-x4]

   guidellm_light:
     tests.llmd.benchmarks.guidellm.data: prompt_tokens=256,output_tokens=128
-    tests.llmd.benchmarks.guidellm.rate: 1,10,50
+    tests.llmd.benchmarks.guidellm.rate: "1,10,50"
     tests.llmd.benchmarks.guidellm.max_seconds: 30

   guidellm_multiturn_eval:
-    tests.llmd.benchmarks.guidellm.data: prompt_tokens=8000,prompt_tokens_stdev=4500,prompt_tokens_min=50,prompt_tokens_max=30000,output_tokens=800,output_tokens_stdev=1500,output_tokens_min=20,output_tokens_max=8000
-    tests.llmd.benchmarks.guidellm.rate: 1,10,50,100,200,300
+    tests.llmd.benchmarks.guidellm.data: "prompt_tokens=128,output_tokens=128,turns=5,prefix_tokens=10000,prefix_count={2*rate}"
+    tests.llmd.benchmarks.guidellm.rate: [32, 64, 128, 256, 512] # keep as a list, multi-rate not supported by guidellm-multiturn
+    tests.llmd.benchmarks.guidellm.max_requests: "{10*rate}"
+
+  guidellm_heterogeneous_eval:
+    tests.llmd.benchmarks.guidellm.data: prompt_tokens=8000,prompt_tokens_stdev=8500,prompt_tokens_min=50,prompt_tokens_max=30000,output_tokens=800,output_tokens_stdev=1500,output_tokens_min=20,output_tokens_max=8000
+    tests.llmd.benchmarks.guidellm.rate: "1,10,50,100,200,300"
     tests.llmd.benchmarks.guidellm.max_seconds: 600

+  gpt-oss:
+    tests.llmd.inference_service.model: gpt-oss-120
+
 clusters:
   cleanup_on_exit: false

@@ -181,7 +190,7 @@ tests:

     inference_service:
       skip_deployment: false
-      name: llama-llm-d
+      name: llm-d
       yaml_file: llama-3-1-8b-instruct-fp8.yaml
       timeout: 900
       do_simple_test: true
@@ -196,7 +205,7 @@ tests:
        - "--trust-remote-code"
        - "--disable-log-requests"
        - "--max-model-len=40960"
-       - "--gpu-memory-utilization=0.9"
+       - "--gpu-memory-utilization=0.92"

     kueue:
       enabled: false
@@ -211,17 +220,13 @@ tests:
       extra_properties: {}

     benchmarks:
-      multiturn:
-        enabled: false
-        name: multiturn-benchmark
-        parallel: 9
-        timeout: 900
-
       guidellm:
         enabled: true
         name: guidellm-benchmark
-        rate-type: concurrent
-        max_seconds: 60
+        backend_type: openai_http
+        rate_type: concurrent
+        max_seconds: null
+        max_requests: null
         timeout: 900
         data: prompt_tokens=256,output_tokens=128
         rate: 1
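The presets above mix two rate shapes: a quoted comma string ("1,10,50") that is forwarded to guidellm as one argument, and a YAML list ([32, 64, ...]) that the test harness in test_llmd.py iterates over, launching one benchmark job per rate. A small sketch of that normalization, assuming the list-vs-scalar handling shown in the test_llmd.py diff of this commit:

```python
def normalize_rates(rate):
    """Mirror the rate handling in test_llmd.py: a list/tuple means one
    benchmark job per rate; anything else (an int, or a comma string that
    guidellm parses itself) means a single job."""
    return list(rate) if isinstance(rate, (list, tuple)) else [rate]

print(normalize_rates([32, 64, 128]))  # three separate jobs
print(normalize_rates("1,10,50"))      # one job; guidellm receives the comma list
print(normalize_rates(1))              # one job
```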

projects/llm-d/testing/llmisvcs/llama-3-1-8b-instruct-fp8.yaml

Lines changed: 0 additions & 1 deletion

@@ -49,7 +49,6 @@ spec:
       apiVersion: inference.networking.x-k8s.io/v1alpha1
       kind: EndpointPickerConfig
       plugins:
-      - type: single-profile-handler
       - type: queue-scorer
       - type: kv-cache-utilization-scorer
       - type: prefix-cache-scorer

projects/llm-d/testing/test_llmd.py

Lines changed: 86 additions & 49 deletions

@@ -67,9 +67,6 @@ def test_single_flavor(flavor, flavor_index, total_flavors, namespace):
         raise RuntimeError("Simple inference test failed :/")

     # Run benchmarks
-    if config.project.get_config("tests.llmd.benchmarks.multiturn.enabled"):
-        flavor_failed |= run_multiturn_benchmark(endpoint_url, llmisvc_name, namespace)
-
     if config.project.get_config("tests.llmd.benchmarks.guidellm.enabled"):
         flavor_failed |= run_guidellm_benchmark(endpoint_url, llmisvc_name, namespace)

@@ -804,74 +801,114 @@ def get_llm_inference_url(llmisvc_name, namespace, flavor):
     return endpoint_url


-def run_multiturn_benchmark(endpoint_url, llmisvc_name, namespace):
+def run_guidellm_benchmark(endpoint_url, llmisvc_name, namespace):
     """
-    Runs the multi-turn benchmark
+    Runs the Guidellm benchmark
     """

-    if not config.project.get_config("tests.llmd.benchmarks.multiturn.enabled"):
+    if not config.project.get_config("tests.llmd.benchmarks.guidellm.enabled"):
         return False

-    logging.info("Running multi-turn benchmark")
+    logging.info("Running Guidellm benchmark")

-    benchmark_name = config.project.get_config("tests.llmd.benchmarks.multiturn.name")
-    parallel = config.project.get_config("tests.llmd.benchmarks.multiturn.parallel")
-    timeout = config.project.get_config("tests.llmd.benchmarks.multiturn.timeout")
+    benchmark_name = config.project.get_config("tests.llmd.benchmarks.guidellm.name")
+    rate = config.project.get_config("tests.llmd.benchmarks.guidellm.rate")
+    backend_type = config.project.get_config("tests.llmd.benchmarks.guidellm.backend_type")
+    rate_type = config.project.get_config("tests.llmd.benchmarks.guidellm.rate_type")
+    max_seconds = config.project.get_config("tests.llmd.benchmarks.guidellm.max_seconds")
+    max_requests = config.project.get_config("tests.llmd.benchmarks.guidellm.max_requests")
+    timeout = config.project.get_config("tests.llmd.benchmarks.guidellm.timeout")
+    data = config.project.get_config("tests.llmd.benchmarks.guidellm.data")

     failed = False

-    endpoint_url = f"{endpoint_url}/v1"
+    # Handle rate as list/tuple - iterate over each rate value
+    if isinstance(rate, (list, tuple)):
+        rate_values = rate
+    else:
+        rate_values = [rate]

-    try:
-        run.run_toolbox("llmd", "run_multiturn_benchmark",
-                        endpoint_url=endpoint_url,
-                        name=benchmark_name,
-                        namespace=namespace,
-                        parallel=parallel,
-                        timeout=timeout)
+    def apply_rate_scaleup(value, rate):
+        """
+        Apply rate-based scaling to configuration values.

-        logging.info("Multi-turn benchmark completed successfully")
+        Evaluates expressions like:
+        - "{10*rate}" with rate=32 -> "320"
+        - "prefix_count={2*rate}" with rate=32 -> "prefix_count=64"
+        """
+        if not isinstance(value, str):
+            return value

-    except Exception as e:
-        logging.error(f"Multi-turn benchmark failed: {e}")
-        failed = True
+        import re

-    return failed
+        # Find all expressions in curly braces
+        pattern = r'\{([^}]+)\}'

+        def evaluate_expression(match):
+            expression = match.group(1)
+            try:
+                # Create a safe evaluation context with only 'rate' variable
+                context = {"rate": rate}
+                result = eval(expression, {"__builtins__": {}}, context)
+                return str(result)
+            except Exception as e:
+                logging.warning(f"Failed to evaluate expression '{expression}' with rate={rate}: {e}")
+                return match.group(0)  # Return original if evaluation fails
+
+        # Replace all expressions with their evaluated results
+        return re.sub(pattern, evaluate_expression, value)
+
+    for rate_value in rate_values:
+        try:
+            logging.info(f"Running Guidellm benchmark with rate: {rate_value}")

-def run_guidellm_benchmark(endpoint_url, llmisvc_name, namespace):
-    """
-    Runs the Guidellm benchmark
-    """
+            # Create unique name for each rate if multiple rates
+            current_name = benchmark_name
+            if len(rate_values) > 1:
+                current_name = f"{benchmark_name}-rate-{rate_value}"

-    if not config.project.get_config("tests.llmd.benchmarks.guidellm.enabled"):
-        return False
+            # Construct guidellm arguments list
+            guidellm_args = []

-    logging.info("Running Guidellm benchmark")
+            # Add default parameters from config
+            if backend_type:
+                guidellm_args.append(f"--backend-type={backend_type}")

-    benchmark_name = config.project.get_config("tests.llmd.benchmarks.guidellm.name")
-    rate = config.project.get_config("tests.llmd.benchmarks.guidellm.rate")
-    max_seconds = config.project.get_config("tests.llmd.benchmarks.guidellm.max_seconds")
-    timeout = config.project.get_config("tests.llmd.benchmarks.guidellm.timeout")
-    data = config.project.get_config("tests.llmd.benchmarks.guidellm.data")
+            if rate_type:
+                guidellm_args.append(f"--rate-type={rate_type}")

-    failed = False
+            # Add rate parameter
+            guidellm_args.append(f"--rate={rate_value}")

-    try:
-        run.run_toolbox("llmd", "run_guidellm_benchmark",
-                        endpoint_url=endpoint_url,
-                        name=benchmark_name,
-                        namespace=namespace,
-                        rate=rate,
-                        max_seconds=max_seconds,
-                        timeout=timeout,
-                        data=data)
+            # Add optional parameters if provided
+            if max_seconds is not None:
+                guidellm_args.append(f"--max-seconds={max_seconds}")

-        logging.info("Guidellm benchmark completed successfully")
+            if max_requests is not None:
+                guidellm_args.append(f"--max-requests={apply_rate_scaleup(max_requests, rate_value)}")

-    except Exception as e:
-        logging.error(f"Guidellm benchmark failed: {e}")
-        failed = True
+            # Add data parameter
+            if data:
+                guidellm_args.append(f"--data={apply_rate_scaleup(data, rate_value)}")
+
+            suffix = f"_rate{rate_value}" if len(rate_values) > 1 \
+                else None
+
+            run.run_toolbox(
+                "llmd", "run_guidellm_benchmark",
+                endpoint_url=endpoint_url,
+                name=current_name,
+                namespace=namespace,
+                timeout=timeout,
+                guidellm_args=guidellm_args,
+                artifact_dir_suffix=suffix,
+            )
+
+            logging.info(f"Guidellm benchmark completed successfully for rate: {rate_value}")
+
+        except Exception as e:
+            logging.error(f"Guidellm benchmark failed for rate {rate_value}: {e}")
+            failed = True

     return failed
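The rate-templating helper added above can be exercised standalone. This sketch reproduces its logic (same regex and restricted eval) outside the test harness, without the logging dependency:

```python
import re

def apply_rate_scaleup(value, rate):
    """Evaluate '{...}' expressions against the current rate, mirroring
    the helper added in test_llmd.py (restricted eval, no builtins)."""
    if not isinstance(value, str):
        return value

    def evaluate(match):
        try:
            return str(eval(match.group(1), {"__builtins__": {}}, {"rate": rate}))
        except Exception:
            return match.group(0)  # leave the expression untouched on failure

    return re.sub(r"\{([^}]+)\}", evaluate, value)

print(apply_rate_scaleup("{10*rate}", 32))              # 320
print(apply_rate_scaleup("prefix_count={2*rate}", 32))  # prefix_count=64
```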

projects/llm-d/toolbox/llmd.py

Lines changed: 3 additions & 32 deletions

@@ -46,39 +46,15 @@ def deploy_llm_inference_service(self, name, namespace, yaml_file):

         return RunAnsibleRole(locals())

-    @AnsibleRole("llmd_run_multiturn_benchmark")
-    @AnsibleMappedParams
-    def run_multiturn_benchmark(
-            self,
-            endpoint_url,
-            name="multi-turn-benchmark", namespace="",
-            image="quay.io/hayesphilip/multi-turn-benchmark", version="0.0.1",
-            timeout=900, parallel=9
-    ):
-        """
-        Runs a multi-turn benchmark job against the LLM inference service
-
-        Args:
-          endpoint_url: Endpoint URL for the LLM inference service to benchmark
-          name: Name of the benchmark job
-          namespace: Namespace to run the benchmark job in (empty string auto-detects current namespace)
-          image: Container image for the benchmark
-          version: Version tag for the benchmark image
-          timeout: Timeout in seconds to wait for job completion
-          parallel: Number of parallel connections
-        """
-
-        return RunAnsibleRole(locals())
-
     @AnsibleRole("llmd_run_guidellm_benchmark")
     @AnsibleMappedParams
     def run_guidellm_benchmark(
             self,
             endpoint_url,
             name="guidellm-benchmark", namespace="",
             image="ghcr.io/vllm-project/guidellm", version="pr-590",
-            timeout=900, rate=1, max_seconds=30,
-            data="prompt_tokens=256,output_tokens=128"
+            timeout=900,
+            guidellm_args=[],
     ):
         """
         Runs a Guidellm benchmark job against the LLM inference service
@@ -90,14 +66,9 @@ def run_guidellm_benchmark(
           image: Container image for the benchmark
           version: Version tag for the benchmark image
           timeout: Timeout in seconds to wait for job completion
-          rate: Request rate for the benchmark
-          max_seconds: Maximum seconds to run benchmark
-          data: Data configuration
+          guidellm_args: List of additional guidellm arguments (e.g., ["--rate=10", "--max-seconds=30"])
         """

-        if isinstance(rate, tuple):
-            rate = ",".join(map(str, rate))
-
         return RunAnsibleRole(locals())

     @AnsibleRole("llmd_capture_isvc_state")
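One Python caveat worth flagging in the new signature: a mutable default such as guidellm_args=[] is created once at function-definition time and shared across all calls. The toolbox method passes it through unmodified, so this is harmless here, but the conventional defensive idiom is a None default plus a copy. A simplified, hypothetical sketch (not the actual toolbox code):

```python
def run_guidellm_benchmark(endpoint_url, guidellm_args=None):
    """Hypothetical stand-in illustrating the None-default idiom for a
    list parameter like guidellm_args."""
    # Copy so that appending here can never leak state into later calls.
    guidellm_args = list(guidellm_args or [])
    guidellm_args.append("--outputs=json")
    return guidellm_args

first = run_guidellm_benchmark("http://a/v1")
second = run_guidellm_benchmark("http://b/v1")  # unaffected by the first call
```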

projects/llm-d/toolbox/llmd_run_guidellm_benchmark/defaults/main/config.yml

Lines changed: 2 additions & 8 deletions

@@ -22,14 +22,8 @@ llmd_run_guidellm_benchmark_version: pr-590
 # Timeout in seconds to wait for job completion
 llmd_run_guidellm_benchmark_timeout: 900

-# Request rate for the benchmark
-llmd_run_guidellm_benchmark_rate: 1
-
-# Maximum seconds to run benchmark
-llmd_run_guidellm_benchmark_max_seconds: 30
-
-# Data configuration
-llmd_run_guidellm_benchmark_data: prompt_tokens=256,output_tokens=128
+# List of additional guidellm arguments (e.g., ["--rate=10", "--max-seconds=30"])
+llmd_run_guidellm_benchmark_guidellm_args: []

 # Default Ansible variables
 # Default value for ansible_os_family to ensure role remains standalone

projects/llm-d/toolbox/llmd_run_guidellm_benchmark/templates/guidellm_benchmark_job.yaml.j2

Lines changed: 3 additions & 5 deletions

@@ -18,11 +18,9 @@ spec:
         - benchmark
         - run
         - --target={{ benchmark_endpoint_url }}
-        - --backend-type=openai_http
-        - --rate-type=concurrent
-        - --rate={{ llmd_run_guidellm_benchmark_rate }}
-        - --max-seconds={{ llmd_run_guidellm_benchmark_max_seconds }}
-        - --data={{ llmd_run_guidellm_benchmark_data }}
+{% for arg in llmd_run_guidellm_benchmark_guidellm_args %}
+        - {{ arg }}
+{% endfor %}
         - --outputs=json
         env:
         - name: USER # version 0.6.0-pr590 currently needs that ...
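The Jinja loop in this template emits one YAML list item per entry of guidellm_args, between the fixed --target prefix and the fixed --outputs=json suffix. A plain-Python sketch of the resulting container args (mimicking the loop rather than invoking Jinja; the endpoint is a made-up placeholder):

```python
def render_job_args(endpoint_url, guidellm_args):
    """Mimic the loop in guidellm_benchmark_job.yaml.j2: fixed prefix,
    one '- <arg>' item per CLI argument, then the fixed '--outputs=json'."""
    lines = ["- benchmark", "- run", f"- --target={endpoint_url}"]
    lines += [f"- {arg}" for arg in guidellm_args]
    lines.append("- --outputs=json")
    return "\n".join(lines)

print(render_job_args("http://llm-d.example/v1",  # hypothetical endpoint
                      ["--backend-type=openai_http", "--rate=10"]))
```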
