Commit 041aa14

feat: e2e tests for max error

1 parent: 3418f74

File tree

9 files changed: 300 additions, 7 deletions
.gitignore

Lines changed: 2 additions & 0 deletions
@@ -178,3 +178,5 @@ cython_debug/
 # Project specific files
 *.json
 *.yaml
+/bin
+uv.lock

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,7 @@ dependencies = [
     "protobuf",
     "pydantic>=2.0.0",
     "pydantic-settings>=2.0.0",
+    "pytest-timeout[dev]>=2.4.0",
     "pyyaml>=6.0.0",
     "rich",
     "transformers",
@@ -78,6 +79,7 @@ dev = [
     "pytest-cov~=5.0.0",
     "pytest-mock~=3.14.0",
     "pytest-rerunfailures~=14.0",
+    "pytest-timeout~=2.4.0",
     "respx~=0.22.0",

     # code quality

src/guidellm/config.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ class Settings(BaseSettings):
     default_async_loop_sleep: float = 10e-5
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
-    shutdown_poll_interval_seconds: float = 10
+    shutdown_poll_interval_seconds: float = 1
     error_check_window_size: int = 10

     # HTTP settings

tests/e2e/README.md

Lines changed: 5 additions & 0 deletions
# E2E tests

The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). To run them, first build the simulator binary with the following command:

```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
```
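
The build above exports the `llm-d-inference-sim` binary into `./bin`, which is the path the test harness expects. A minimal sketch of running the suite afterwards, assuming the tests are invoked with plain pytest from the repository root (the exact invocation is not part of this commit):

```shell
# Build the simulator binary into ./bin (as described above), then run the e2e suite.
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
pytest tests/e2e -v
```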

tests/e2e/test_basic.py

Lines changed: 60 additions & 0 deletions
# tests/e2e/test_basic.py

import json
import os
from pathlib import Path

import pytest
from loguru import logger

from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
    """
    Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
    """
    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b")
    try:
        server.start()
        yield server  # Yield the server for tests to use
    finally:
        server.stop()  # Teardown: stop the server after tests are done


@pytest.mark.timeout(30)
def test_basic_report(server: VllmSimServer):
    """
    Run a short constant-rate benchmark against the simulator and verify
    that the generated report contains the expected successful requests.
    """
    report_path = Path("tests/e2e/benchmarks.json")
    rate = 10
    command = f"""
    guidellm benchmark \
        --target "{server.get_url()}" \
        --rate-type constant \
        --rate {rate} \
        --max-seconds 1 \
        --data "prompt_tokens=256,output_tokens=128" \
        --output-path {report_path}
    """
    logger.info(f"Client command: {command}")
    os.system(command)  # noqa: S605

    assert report_path.exists()
    with report_path.open("r") as f:
        report = json.load(f)

    assert "benchmarks" in report
    benchmarks = report["benchmarks"]
    assert len(benchmarks) > 0
    benchmark = benchmarks[0]
    assert "requests" in benchmark
    requests = benchmark["requests"]
    assert "successful" in requests
    successful = requests["successful"]
    assert len(successful) == rate

    report_path.unlink()
tests/e2e/test_interrupted.py

Lines changed: 88 additions & 0 deletions
# tests/e2e/test_interrupted.py

import json
import subprocess
import time
from pathlib import Path

import pytest
from loguru import logger

from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
    """
    Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
    """
    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b")
    try:
        server.start()
        yield server  # Yield the server for tests to use
    finally:
        server.stop()  # Teardown: stop the server after tests are done


@pytest.mark.timeout(30)
def test_interrupted_report(server: VllmSimServer):
    """
    Start a constant-rate benchmark with a maximum error rate, stop the
    simulator mid-run, and verify that a report is still written and that
    its recorded error rate exceeds the configured threshold.
    """
    report_path = Path("tests/e2e/benchmarks.json")
    rate = 10
    max_error_rate = 0.1
    command = f"""guidellm benchmark \
        --target "{server.get_url()}" \
        --rate-type constant \
        --rate {rate} \
        --max-seconds 1000 \
        --max-error {max_error_rate} \
        --data "prompt_tokens=256,output_tokens=128" \
        --output-path {report_path}
    """
    logger.info(f"Client command: {command}")
    process = subprocess.Popen(  # noqa: S603
        ["/bin/bash", "-c", command],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    logger.info("Waiting for client to start...")
    time.sleep(5)
    server.stop()
    logger.info("Waiting for client to stop...")
    time.sleep(5)

    logger.info("Fetching client output")
    stdout, stderr = process.communicate()
    logger.info(f"Client stdout:\n{stdout}")
    logger.info(f"Client stderr:\n{stderr}")

    assert report_path.exists()
    with report_path.open("r") as f:
        report = json.load(f)

    assert "benchmarks" in report
    benchmarks = report["benchmarks"]
    assert len(benchmarks) > 0
    benchmark = benchmarks[0]
    assert "requests" in benchmark
    requests = benchmark["requests"]
    assert "successful" in requests
    successful = requests["successful"]
    assert "errored" in requests
    errored = requests["errored"]
    assert len(errored) / (len(successful) + len(errored)) > max_error_rate

    report_path.unlink()

    process.terminate()  # Send SIGTERM
    try:
        process.wait(timeout=5)  # Wait for the process to terminate
        logger.info("Client stopped successfully.")
    except subprocess.TimeoutExpired:
        logger.warning("Client did not terminate gracefully, killing it...")
        process.kill()  # Send SIGKILL if it doesn't terminate
        process.wait()

tests/e2e/test_placeholder.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

tests/e2e/vllm-sim.Dockerfile

Lines changed: 12 additions & 0 deletions
FROM golang AS base

WORKDIR /app

RUN git clone https://github.com/llm-d/llm-d-inference-sim.git && \
    cd llm-d-inference-sim && \
    make build

WORKDIR /app/llm-d-inference-sim

FROM scratch
COPY --from=base /app/llm-d-inference-sim/bin/llm-d-inference-sim /bin/llm-d-inference-sim

tests/e2e/vllm_sim_server.py

Lines changed: 130 additions & 0 deletions
import subprocess
import time

import pytest
import requests
from loguru import logger


class VllmSimServer:
    """
    A class to manage the lifecycle of a test server.
    Encapsulates starting, checking health, and stopping the server process.
    """

    def __init__(
        self,
        port: int,
        model: str,
        lora: list[str] | None = None,
        mode: str | None = None,
        echo: bool | None = None,
        random: bool | None = None,
        time_to_first_token: float | None = None,
        inter_token_latency: float | None = None,
        max_loras: int | None = None,
        max_cpu_loras: int | None = None,
        max_running_requests: int | None = None,
    ):
        self.port = port
        self.model = model
        self.lora = lora
        self.mode = mode
        self.echo = echo
        self.random = random
        self.time_to_first_token = time_to_first_token
        self.inter_token_latency = inter_token_latency
        self.max_loras = max_loras
        self.max_cpu_loras = max_cpu_loras
        self.max_running_requests = max_running_requests
        self.server_url = f"http://127.0.0.1:{self.port}"
        self.health_url = f"{self.server_url}/health"
        self.app_script = "./bin/llm-d-inference-sim"
        self.process = None

    def get_cli_parameters(self) -> list[str]:
        parameters = [
            "--port", f"{self.port}",
            "--model", self.model,
        ]
        if self.lora is not None:
            parameters.extend(["--lora", ",".join(self.lora)])
        if self.mode is not None:
            parameters.extend(["--mode", self.mode])
        if self.echo is not None:
            parameters.extend(["--echo"])
        if self.random is not None:
            parameters.extend(["--random"])
        if self.time_to_first_token is not None:
            parameters.extend(["--time-to-first-token", f"{self.time_to_first_token}"])
        if self.inter_token_latency is not None:
            parameters.extend(["--inter-token-latency", f"{self.inter_token_latency}"])
        if self.max_loras is not None:
            parameters.extend(["--max-loras", f"{self.max_loras}"])
        if self.max_cpu_loras is not None:
            parameters.extend(["--max-cpu-loras", f"{self.max_cpu_loras}"])
        if self.max_running_requests is not None:
            parameters.extend(["--max-running-requests", f"{self.max_running_requests}"])
        return parameters

    def start(self):
        """
        Starts the server process and waits for it to become healthy.
        """
        logger.info(f"Starting server on {self.server_url} using {self.app_script}...")
        cli_parameters = self.get_cli_parameters()
        command = " ".join([self.app_script, *cli_parameters])
        logger.info(f"Server command: {command}")
        self.process = subprocess.Popen(  # noqa: S603
            [self.app_script, *cli_parameters],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,  # Decode stdout/stderr as text
        )

        # Wait for the server to start and become healthy
        max_retries = 20
        retry_delay_sec = 0.5
        for i in range(max_retries):
            try:
                response = requests.get(self.health_url, timeout=1)
                if response.status_code == 200:
                    logger.info(f"Server started successfully at {self.server_url}")
                    return
                logger.warning(f"Got response with status: {response.status_code}")
                logger.warning(response.json())
            except requests.ConnectionError:
                logger.warning(f"Waiting for server... (attempt {i + 1}/{max_retries})")
            time.sleep(retry_delay_sec)
        # If the loop completes without returning, the server didn't start
        stdout, stderr = self.process.communicate()
        logger.error(f"Server failed to start after {max_retries} retries.")
        logger.error(f"Server stdout:\n{stdout}")
        logger.error(f"Server stderr:\n{stderr}")
        self.stop()  # Attempt to clean up
        pytest.fail("Server did not start within the expected time.")

    def stop(self):
        """
        Stops the server process.
        """
        if self.process:
            logger.info(f"Stopping server on {self.server_url}...")
            self.process.terminate()  # Send SIGTERM
            try:
                self.process.wait(timeout=5)  # Wait for the process to terminate
                logger.info("Server stopped successfully.")
            except subprocess.TimeoutExpired:
                logger.warning("Server did not terminate gracefully, killing it...")
                self.process.kill()  # Send SIGKILL if it doesn't terminate
                self.process.wait()
            self.process = None  # Clear the process reference

    def get_url(self):
        """
        Returns the base URL of the running server.
        """
        return self.server_url