diff --git a/.gitignore b/.gitignore
index e0a29972..f1654067 100644
--- a/.gitignore
+++ b/.gitignore
@@ -230,3 +230,6 @@ src/ui/next-env.d.ts
 !src/ui/public/manifest.json
 !src/ui/serve.json
 .eslintcache
+
+# vllm-sim
+bin/
diff --git a/tests/e2e/README.md b/tests/e2e/README.md
new file mode 100644
index 00000000..c29c148d
--- /dev/null
+++ b/tests/e2e/README.md
@@ -0,0 +1,12 @@
+# E2E tests
+
+The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). To build the simulator binary used by the tests, run the following command:
+
+```shell
+docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
+```
+
+Then, to run the tests:
+```shell
+tox -e test-e2e
+```
diff --git a/tests/e2e/test_max_error_benchmark.py b/tests/e2e/test_max_error_benchmark.py
new file mode 100644
index 00000000..6079b21c
--- /dev/null
+++ b/tests/e2e/test_max_error_benchmark.py
@@ -0,0 +1,72 @@
+# E2E test for max error rate constraint functionality
+
+from pathlib import Path
+
+import pytest
+
+from tests.e2e.utils import (
+    GuidellmClient,
+    assert_constraint_triggered,
+    assert_no_python_exceptions,
+    cleanup_report_file,
+    load_benchmark_report,
+)
+from tests.e2e.vllm_sim_server import VllmSimServer
+
+
+@pytest.fixture(scope="module")
+def server():
+    """
+    Pytest fixture to start and stop the server for the entire module
+    using the VllmSimServer class.
+    """
+    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b", mode="echo")
+    try:
+        server.start()
+        yield server  # Yield the server for tests to use
+    finally:
+        server.stop()  # Teardown: Stop the server after tests are done
+
+
+@pytest.mark.timeout(30)
+def test_max_error_benchmark(server: VllmSimServer):
+    """
+    Test that the max error rate constraint is properly triggered when the server goes down.
+    """
+    report_path = Path("tests/e2e/max_error_benchmarks.json")
+    rate = 10
+    max_error_rate = 0.1
+
+    # Create and configure the guidellm client
+    client = GuidellmClient(target=server.get_url(), output_path=report_path)
+
+    try:
+        # Start the benchmark
+        client.start_benchmark(
+            rate=rate,
+            max_seconds=25,
+            max_error_rate=max_error_rate,
+        )
+
+        # Wait for the benchmark to complete (server will be stopped after 10 seconds)
+        client.wait_for_completion(timeout=30, stop_server_after=10, server=server)
+
+        # Assert no Python exceptions occurred
+        assert_no_python_exceptions(client.stderr)
+
+        # Load and validate the report
+        report = load_benchmark_report(report_path)
+        benchmark = report["benchmarks"][0]
+
+        # Check that the max error rate constraint was triggered
+        assert_constraint_triggered(
+            benchmark,
+            "max_error_rate",
+            {
+                "exceeded_error_rate": True,
+                "current_error_rate": lambda rate: rate >= max_error_rate,
+            },
+        )
+
+    finally:
+        cleanup_report_file(report_path)
diff --git a/tests/e2e/test_placeholder.py b/tests/e2e/test_placeholder.py
deleted file mode 100644
index 0d35031c..00000000
--- a/tests/e2e/test_placeholder.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import pytest
-
-
-@pytest.mark.smoke
-def test_placeholder():
-    assert True
diff --git a/tests/e2e/test_successful_benchmark.py b/tests/e2e/test_successful_benchmark.py
new file mode 100644
index 00000000..8f0181a3
--- /dev/null
+++ b/tests/e2e/test_successful_benchmark.py
@@ -0,0 +1,120 @@
+# E2E tests for successful benchmark scenarios with timing validation
+
+from pathlib import Path
+
+import pytest
+
+from tests.e2e.utils import (
+    GuidellmClient,
+    assert_constraint_triggered,
+    assert_no_python_exceptions,
+    assert_successful_requests_fields,
+    cleanup_report_file,
+    load_benchmark_report,
+)
+from tests.e2e.vllm_sim_server import VllmSimServer
+
+
+@pytest.fixture(scope="module")
+def server():
+    """
+    Pytest fixture to start and stop the server for the entire module
+    using the VllmSimServer class.
+    """
+    server = VllmSimServer(
+        port=8000,
+        model="databricks/dolly-v2-12b",
+        mode="echo",
+        time_to_first_token=1,  # 1ms TTFT
+        inter_token_latency=1,  # 1ms ITL
+    )
+    try:
+        server.start()
+        yield server  # Yield the server for tests to use
+    finally:
+        server.stop()  # Teardown: Stop the server after tests are done
+
+
+@pytest.mark.timeout(30)
+def test_max_seconds_benchmark(server: VllmSimServer):
+    """
+    Test that the max seconds constraint is properly triggered.
+    """
+    report_path = Path("tests/e2e/max_duration_benchmarks.json")
+    rate = 10
+
+    # Create and configure the guidellm client
+    client = GuidellmClient(target=server.get_url(), output_path=report_path)
+
+    try:
+        # Start the benchmark
+        client.start_benchmark(
+            rate=rate,
+            max_seconds=1,
+        )
+
+        # Wait for the benchmark to complete
+        client.wait_for_completion(timeout=30)
+
+        # Assert no Python exceptions occurred
+        assert_no_python_exceptions(client.stderr)
+
+        # Load and validate the report
+        report = load_benchmark_report(report_path)
+        benchmark = report["benchmarks"][0]
+
+        # Check that the max duration constraint was triggered
+        assert_constraint_triggered(
+            benchmark, "max_seconds", {"duration_exceeded": True}
+        )
+
+        # Validate successful requests have all expected fields
+        successful_requests = benchmark["requests"]["successful"]
+        assert_successful_requests_fields(successful_requests)
+
+    finally:
+        cleanup_report_file(report_path)
+
+
+@pytest.mark.timeout(30)
+def test_max_requests_benchmark(server: VllmSimServer):
+    """
+    Test that the max requests constraint is properly triggered.
+    """
+    report_path = Path("tests/e2e/max_number_benchmarks.json")
+    rate = 10
+
+    # Create and configure the guidellm client
+    client = GuidellmClient(target=server.get_url(), output_path=report_path)
+
+    try:
+        # Start the benchmark
+        client.start_benchmark(
+            rate=rate,
+            max_requests=rate,
+        )
+
+        # Wait for the benchmark to complete
+        client.wait_for_completion(timeout=30)
+
+        # Assert no Python exceptions occurred
+        assert_no_python_exceptions(client.stderr)
+
+        # Load and validate the report
+        report = load_benchmark_report(report_path)
+        benchmark = report["benchmarks"][0]
+
+        # Check that the max requests constraint was triggered
+        assert_constraint_triggered(
+            benchmark, "max_requests", {"processed_exceeded": True}
+        )
+
+        # Validate successful requests have all expected fields
+        successful_requests = benchmark["requests"]["successful"]
+        assert len(successful_requests) == rate, (
+            f"Expected {rate} successful requests, got {len(successful_requests)}"
+        )
+        assert_successful_requests_fields(successful_requests)
+
+    finally:
+        cleanup_report_file(report_path)
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
new file mode 100644
index 00000000..9357949c
--- /dev/null
+++ b/tests/e2e/utils.py
@@ -0,0 +1,327 @@
+"""Utilities for E2E tests."""
+
+import json
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+from loguru import logger
+
+
+def get_guidellm_executable() -> str:
+    """Get the path to the guidellm executable in the current environment."""
+    # Get the directory where the current Python executable is located
+    python_bin_dir = Path(sys.executable).parent
+    guidellm_path = python_bin_dir / "guidellm"
+    if guidellm_path.exists():
+        return str(guidellm_path)
+    else:
+        # Fallback to just "guidellm" if not found
+        return "guidellm"
+
+
+class GuidellmClient:
+    """Wrapper class for running guidellm benchmark commands."""
+
+    def __init__(self, target: str, output_path: Path):
+        """
+        Initialize the guidellm client.
+
+        :param target: The target URL for the benchmark
+        :param output_path: Path where the benchmark report will be saved
+        """
+        self.target = target
+        self.output_path = output_path
+        self.process: Optional[subprocess.Popen] = None
+        self.stdout: Optional[str] = None
+        self.stderr: Optional[str] = None
+
+    def start_benchmark(
+        self,
+        rate_type: str = "constant",
+        rate: int = 10,
+        max_seconds: Optional[int] = None,
+        max_requests: Optional[int] = None,
+        max_error_rate: Optional[float] = None,
+        data: str = "prompt_tokens=256,output_tokens=128",
+        processor: str = "gpt2",
+        additional_args: str = "",
+    ) -> None:
+        """
+        Start a guidellm benchmark command.
+
+        :param rate_type: Type of rate control (constant, etc.)
+        :param rate: Request rate
+        :param max_seconds: Maximum duration in seconds
+        :param max_requests: Maximum number of requests
+        :param max_error_rate: Maximum error rate before stopping
+        :param data: Data configuration string
+        :param processor: Processor/tokenizer to use
+        :param additional_args: Additional command line arguments
+        """
+        guidellm_exe = get_guidellm_executable()
+
+        # Build command components
+        cmd_parts = [
+            f"GUIDELLM__MAX_CONCURRENCY=10 GUIDELLM__MAX_WORKER_PROCESSES=10 {guidellm_exe} benchmark",
+            f'--target "{self.target}"',
+            f"--rate-type {rate_type}",
+            f"--rate {rate}",
+        ]
+
+        if max_seconds is not None:
+            cmd_parts.append(f"--max-seconds {max_seconds}")
+
+        if max_requests is not None:
+            cmd_parts.append(f"--max-requests {max_requests}")
+
+        if max_error_rate is not None:
+            cmd_parts.append(f"--max-error-rate {max_error_rate}")
+
+        cmd_parts.extend(
+            [
+                f'--data "{data}"',
+                f'--processor "{processor}"',
+                f"--output-path {self.output_path}",
+            ]
+        )
+
+        if additional_args:
+            cmd_parts.append(additional_args)
+
+        command = " \\\n ".join(cmd_parts)
+
+        logger.info(f"Client command: {command}")
+
+        self.process = subprocess.Popen(  # noqa: S603
+            ["/bin/bash", "-c", command],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+
+    def wait_for_completion(
+        self, timeout: int = 30, stop_server_after: Optional[int] = None, server=None
+    ) -> None:
+        """
+        Wait for the benchmark to complete.
+
+        :param timeout: Maximum time to wait for completion
+        :param stop_server_after: If provided, stop the server after this many seconds
+        :param server: Server object to stop (if stop_server_after is provided)
+        """
+        if self.process is None:
+            raise RuntimeError("No process started. Call start_benchmark() first.")
+
+        if stop_server_after is not None and server is not None:
+            logger.info(
+                f"Waiting {stop_server_after} seconds before stopping server..."
+            )
+            time.sleep(stop_server_after)
+            server.stop()
+
+        try:
+            logger.info("Fetching client output")
+            self.stdout, self.stderr = self.process.communicate(timeout=timeout)
+            logger.debug(f"Client stdout:\n{self.stdout}")
+            logger.debug(f"Client stderr:\n{self.stderr}")
+
+        except subprocess.TimeoutExpired:
+            logger.warning("Client did not complete within timeout, terminating...")
+            self.process.terminate()
+            try:
+                self.stdout, self.stderr = self.process.communicate(timeout=5)
+            except subprocess.TimeoutExpired:
+                logger.warning("Client did not terminate gracefully, killing it...")
+                self.process.kill()
+                self.stdout, self.stderr = self.process.communicate()
+        finally:
+            if self.process and self.process.poll() is None:
+                self.process.terminate()
+                try:
+                    self.process.wait(timeout=5)
+                    logger.info("Client stopped successfully.")
+                except subprocess.TimeoutExpired:
+                    logger.warning("Client did not terminate gracefully, killing it...")
+                    self.process.kill()
+                    self.process.wait()
+
+
+def assert_no_python_exceptions(stderr: Optional[str]) -> None:
+    """
+    Assert that stderr does not contain any Python exception indicators.
+
+    :param stderr: The stderr string to check (can be None)
+    :raises AssertionError: If Python exceptions are detected
+    """
+    if stderr is None:
+        return  # No stderr to check
+
+    python_exception_indicators = [
+        "Traceback (most recent call last):",
+        "AttributeError:",
+        "ValueError:",
+        "TypeError:",
+        "KeyError:",
+        "IndexError:",
+        "NameError:",
+        "ImportError:",
+        "RuntimeError:",
+    ]
+
+    for indicator in python_exception_indicators:
+        assert indicator not in stderr, (
+            f"Python exception detected in stderr: {indicator}"
+        )
+
+
+def load_benchmark_report(report_path: Path) -> dict:
+    """
+    Load and validate a benchmark report JSON file.
+
+    :param report_path: Path to the report file
+    :return: The loaded report dictionary
+    :raises AssertionError: If the file doesn't exist or is invalid
+    """
+    assert report_path.exists(), f"Report file does not exist: {report_path}"
+
+    with report_path.open("r") as f:
+        report = json.load(f)
+
+    assert "benchmarks" in report, "Report missing 'benchmarks' field"
+    benchmarks = report["benchmarks"]
+    assert len(benchmarks) > 0, "Report contains no benchmarks"
+
+    return report
+
+
+def assert_successful_requests_fields(successful_requests: list) -> None:
+    """
+    Assert that successful requests contain all expected timing and token fields.
+
+    :param successful_requests: List of successful request objects
+    :raises AssertionError: If required fields are missing or invalid
+    """
+    assert len(successful_requests) >= 1, "No successful requests found"
+
+    for request in successful_requests:
+        # Basic latency
+        assert "request_latency" in request, "Missing 'request_latency' field"
+        assert request["request_latency"] > 0, "request_latency should be > 0"
+
+        # Streaming timing fields
+        assert "time_to_first_token_ms" in request, (
+            "Missing 'time_to_first_token_ms' field"
+        )
+        assert request["time_to_first_token_ms"] is not None, (
+            "time_to_first_token_ms should not be None"
+        )
+        assert request["time_to_first_token_ms"] > 0, (
+            "time_to_first_token_ms should be > 0"
+        )
+
+        assert "time_per_output_token_ms" in request, (
+            "Missing 'time_per_output_token_ms' field"
+        )
+        assert request["time_per_output_token_ms"] is not None, (
+            "time_per_output_token_ms should not be None"
+        )
+        assert request["time_per_output_token_ms"] > 0, (
+            "time_per_output_token_ms should be > 0"
+        )
+
+        assert "inter_token_latency_ms" in request, (
+            "Missing 'inter_token_latency_ms' field"
+        )
+        assert request["inter_token_latency_ms"] is not None, (
+            "inter_token_latency_ms should not be None"
+        )
+        assert request["inter_token_latency_ms"] > 0, (
+            "inter_token_latency_ms should be > 0"
+        )
+
+        # Token throughput fields
+        assert "tokens_per_second" in request, "Missing 'tokens_per_second' field"
+        assert request["tokens_per_second"] > 0, "tokens_per_second should be > 0"
+
+        assert "output_tokens_per_second" in request, (
+            "Missing 'output_tokens_per_second' field"
+        )
+        assert request["output_tokens_per_second"] > 0, (
+            "output_tokens_per_second should be > 0"
+        )
+
+        # Token count fields
+        assert "total_tokens" in request, "Missing 'total_tokens' field"
+        assert request["total_tokens"] > 0, "total_tokens should be > 0"
+
+        assert "prompt_tokens" in request, "Missing 'prompt_tokens' field"
+        assert request["prompt_tokens"] > 0, "prompt_tokens should be > 0"
+
+        assert "output_tokens" in request, "Missing 'output_tokens' field"
+        assert request["output_tokens"] > 0, "output_tokens should be > 0"
+
+
+def assert_constraint_triggered(
+    benchmark: dict, constraint_name: str, expected_metadata: dict
+) -> None:
+    """
+    Assert that a specific constraint was triggered with expected metadata.
+
+    :param benchmark: The benchmark object
+    :param constraint_name: Name of the constraint (e.g., 'max_seconds', 'max_requests', 'max_error_rate')
+    :param expected_metadata: Dictionary of expected metadata fields and values
+    :raises AssertionError: If constraint was not triggered or metadata is incorrect
+    """
+    assert "scheduler" in benchmark, "Benchmark missing 'scheduler' field"
+    scheduler = benchmark["scheduler"]
+
+    assert "state" in scheduler, "Scheduler missing 'state' field"
+    state = scheduler["state"]
+
+    assert "end_processing_constraints" in state, (
+        "State missing 'end_processing_constraints' field"
+    )
+    constraints = state["end_processing_constraints"]
+
+    assert constraint_name in constraints, (
+        f"Constraint '{constraint_name}' was not triggered"
+    )
+    constraint = constraints[constraint_name]
+
+    assert "metadata" in constraint, (
+        f"Constraint '{constraint_name}' missing 'metadata' field"
+    )
+    metadata = constraint["metadata"]
+
+    for key, expected_value in expected_metadata.items():
+        assert key in metadata, (
+            f"Constraint '{constraint_name}' metadata missing '{key}' field"
+        )
+        actual_value = metadata[key]
+
+        if isinstance(expected_value, bool):
+            assert actual_value is expected_value, (
+                f"Expected {key}={expected_value}, got {actual_value}"
+            )
+        elif callable(expected_value):
+            # Allow callable predicates for complex validation
+            assert expected_value(actual_value), (
+                f"Predicate failed for {key}={actual_value}"
+            )
+        else:
+            assert actual_value == expected_value, (
+                f"Expected {key}={expected_value}, got {actual_value}"
+            )
+
+
+def cleanup_report_file(report_path: Path) -> None:
+    """
+    Clean up the report file if it exists.
+
+    :param report_path: Path to the report file to remove
+    """
+    if report_path.exists():
+        report_path.unlink()
diff --git a/tests/e2e/vllm-sim.Dockerfile b/tests/e2e/vllm-sim.Dockerfile
new file mode 100644
index 00000000..63be0fbd
--- /dev/null
+++ b/tests/e2e/vllm-sim.Dockerfile
@@ -0,0 +1,15 @@
+FROM golang AS base
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y libzmq3-dev pkg-config && \
+    git clone https://github.com/llm-d/llm-d-inference-sim.git && \
+    cd llm-d-inference-sim && \
+    git checkout v0.3.0 && \
+    make build
+
+WORKDIR /app/llm-d-inference-sim
+
+FROM scratch
+COPY --from=base /app/llm-d-inference-sim/bin /bin
diff --git a/tests/e2e/vllm_sim_server.py b/tests/e2e/vllm_sim_server.py
new file mode 100644
index 00000000..726dba40
--- /dev/null
+++ b/tests/e2e/vllm_sim_server.py
@@ -0,0 +1,136 @@
+import subprocess
+import time
+from pathlib import Path
+from typing import Optional
+
+import pytest
+import requests
+from loguru import logger
+
+
+class VllmSimServer:
+    """
+    A pytest wrapper around the
+    [vLLM simulator](https://llm-d.ai/docs/architecture/Components/inf-simulator).
+    """
+
+    def __init__(
+        self,
+        port: int,
+        model: str,
+        lora: Optional[list[str]] = None,
+        mode: Optional[str] = None,
+        echo: Optional[bool] = None,
+        random: Optional[bool] = None,
+        time_to_first_token: Optional[float] = None,
+        inter_token_latency: Optional[float] = None,
+        max_loras: Optional[int] = None,
+        max_cpu_loras: Optional[int] = None,
+        max_num_seqs: Optional[int] = None,
+    ):
+        self.port = port
+        self.model = model
+        self.lora = lora
+        self.mode = mode
+        self.echo = echo
+        self.random = random
+        self.time_to_first_token = time_to_first_token
+        self.inter_token_latency = inter_token_latency
+        self.max_loras = max_loras
+        self.max_cpu_loras = max_cpu_loras
+        self.max_num_seqs = max_num_seqs
+        self.server_url = f"http://127.0.0.1:{self.port}"
+        self.health_url = f"{self.server_url}/health"
+        self.app_script = "./bin/llm-d-inference-sim"
+        self.process: Optional[subprocess.Popen] = None
+        if not Path(self.app_script).exists():
+            message = (
+                "The vLLM simulator binary is required for E2E tests, but is missing.\n"
+                "To build it and enable E2E tests, please run:\n"
+                "docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./"
+            )
+            logger.warning(message)
+            pytest.skip("vLLM simulator binary missing", allow_module_level=True)
+
+    def get_cli_parameters(self) -> list[str]:
+        parameters = ["--port", f"{self.port}", "--model", self.model]
+        if self.lora is not None:
+            parameters.extend(["--lora", ",".join(self.lora)])
+        if self.mode is not None:
+            parameters.extend(["--mode", self.mode])
+        if self.echo is not None:
+            parameters.extend(["--echo"])
+        if self.random is not None:
+            parameters.extend(["--random"])
+        if self.time_to_first_token is not None:
+            parameters.extend(["--time-to-first-token", f"{self.time_to_first_token}"])
+        if self.inter_token_latency is not None:
+            parameters.extend(["--inter-token-latency", f"{self.inter_token_latency}"])
+        if self.max_loras is not None:
+            parameters.extend(["--max-loras", f"{self.max_loras}"])
+        if self.max_cpu_loras is not None:
+            parameters.extend(["--max-cpu-loras", f"{self.max_cpu_loras}"])
+        if self.max_num_seqs is not None:
+            parameters.extend(["--max-num-seqs", f"{self.max_num_seqs}"])
+        return parameters
+
+    def start(self):
+        """
+        Starts the server process and waits for it to become healthy.
+        """
+
+        logger.info(f"Starting server on {self.server_url} using {self.app_script}...")
+        cli_parameters = self.get_cli_parameters()
+        command = " ".join([self.app_script, *cli_parameters])
+        logger.info(f"Server command: {command}")
+        self.process = subprocess.Popen(  # noqa: S603
+            [self.app_script, *cli_parameters],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,  # Decode stdout/stderr as text
+        )
+
+        # Wait for the server to start and become healthy
+        max_retries = 20
+        retry_delay_sec = 0.5
+        for i in range(max_retries):
+            try:
+                response = requests.get(self.health_url, timeout=1)
+                if response.status_code == 200:
+                    logger.info(f"Server started successfully at {self.server_url}")
+                    return
+                else:
+                    logger.warning(f"Got response with status: {response.status_code}")
+                    logger.warning(response.json())
+            except requests.ConnectionError:
+                logger.warning(f"Waiting for server... (attempt {i + 1}/{max_retries})")
+            time.sleep(retry_delay_sec)
+        # If the loop completes without breaking, the server didn't start
+        stdout, stderr = self.process.communicate()
+        logger.error(f"Server failed to start after {max_retries} retries.")
+        logger.error(f"Server stdout:\n{stdout}")
+        logger.error(f"Server stderr:\n{stderr}")
+        self.stop()  # Attempt to clean up
+        pytest.fail("Server did not start within the expected time.")
+
+    def stop(self):
+        """
+        Stops the server process.
+        """
+        if self.process:
+            logger.info(f"Stopping server on {self.server_url}...")
+            self.process.terminate()  # Send SIGTERM
+            try:
+                self.process.wait(timeout=1)  # Wait for the process to terminate
+                logger.info("Server stopped successfully.")
+            except subprocess.TimeoutExpired:
+                logger.warning("Server did not terminate gracefully, killing it...")
+                self.process.kill()  # Send SIGKILL if it doesn't terminate
+                self.process.wait()
+            self.process = None  # Clear the process reference
+
+    def get_url(self):
+        """
+        Returns the base URL of the running server.
+        """
+        return self.server_url
diff --git a/tests/integration/scheduler/test_scheduler.py b/tests/integration/scheduler/test_scheduler.py
index edff9e8f..51abf59b 100644
--- a/tests/integration/scheduler/test_scheduler.py
+++ b/tests/integration/scheduler/test_scheduler.py
@@ -88,7 +88,7 @@ async def resolve(self, request: MockRequest, request_info, request_history):
         ):
             raise RuntimeError(f"mock_error_for_{request.payload}")
 
-        yield f"response_for_{request.payload}"
+        yield f"response_for_{request.payload}", request_info
 
 
 @pytest.mark.smoke
diff --git a/tests/integration/scheduler/test_worker_group.py b/tests/integration/scheduler/test_worker_group.py
index 4c39f36d..c96f6dec 100644
--- a/tests/integration/scheduler/test_worker_group.py
+++ b/tests/integration/scheduler/test_worker_group.py
@@ -22,11 +22,11 @@
     AsyncPoissonStrategy,
     BackendInterface,
     ConcurrentStrategy,
-    MaxDurationConstraintInitializer,
-    MaxErrorRateConstraintInitializer,
-    MaxErrorsConstraintInitializer,
-    MaxGlobalErrorRateConstraintInitializer,
-    MaxNumberConstraintInitializer,
+    MaxDurationConstraint,
+    MaxErrorRateConstraint,
+    MaxErrorsConstraint,
+    MaxGlobalErrorRateConstraint,
+    MaxNumberConstraint,
     MeasuredRequestTimings,
     SynchronousStrategy,
     ThroughputStrategy,
@@ -98,7 +98,7 @@ async def resolve(self, request, request_info, request_history):
         ):
             raise RuntimeError("Mock error for testing")
 
-        yield f"response_for_{request}"
+        yield f"response_for_{request}", request_info
 
 
 class TestWorkerGroup:
@@ -118,15 +118,11 @@ class TestWorkerGroup:
     @pytest.mark.parametrize(
         "constraints_inits",
         [
-            {"max_num": MaxNumberConstraintInitializer(max_num=100)},
-            {"max_duration": MaxDurationConstraintInitializer(max_duration=0.5)},
-            {"max_errors": MaxErrorsConstraintInitializer(max_errors=20)},
-            {"max_error_rate": MaxErrorRateConstraintInitializer(max_error_rate=0.1)},
-            {
-                "max_global_error_rate": MaxGlobalErrorRateConstraintInitializer(
-                    max_error_rate=0.1
-                )
-            },
+            {"max_num": MaxNumberConstraint(max_num=100)},
+            {"max_duration": MaxDurationConstraint(max_duration=0.5)},
+            {"max_errors": MaxErrorsConstraint(max_errors=20)},
+            {"max_error_rate": MaxErrorRateConstraint(max_error_rate=0.1)},
+            {"max_global_error_rate": MaxGlobalErrorRateConstraint(max_error_rate=0.1)},
         ],
     )
     async def test_lifecycle(
diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py
index 9076834b..d4d73aa0 100644
--- a/tests/unit/benchmark/test_output.py
+++ 
b/tests/unit/benchmark/test_output.py @@ -10,7 +10,7 @@ from guidellm.benchmark import ( GenerativeBenchmarksReport, ) -from guidellm.benchmark.output import GenerativeBenchmarksConsole +from guidellm.benchmark.output import GenerativeBenchmarkerConsole from tests.unit.mock_benchmark import mock_generative_benchmark @@ -100,7 +100,7 @@ def test_file_csv(): def test_console_benchmarks_profile_str(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert ( @@ -109,7 +109,7 @@ def test_console_benchmarks_profile_str(): def test_console_benchmarks_args_str(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( @@ -119,14 +119,14 @@ def test_console_benchmarks_args_str(): def test_console_benchmarks_worker_desc_str(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_worker_desc_str == str(mock_benchmark.worker) def test_console_benchmarks_request_loader_desc_str(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_request_loader_desc_str == str( @@ -135,35 +135,35 @@ def test_console_benchmarks_request_loader_desc_str(): def test_console_benchmarks_extras_str(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_extras_str == "None" def test_console_print_section_header(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() with patch.object(console.console, "print") as mock_print: console.print_section_header("Test Header") mock_print.assert_called_once() def test_console_print_labeled_line(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() with patch.object(console.console, "print") as mock_print: console.print_labeled_line("Label", "Value") mock_print.assert_called_once() def test_console_print_line(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() with patch.object(console.console, "print") as mock_print: console.print_line("Test Line") mock_print.assert_called_once() def test_console_print_table(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() headers = ["Header1", "Header2"] rows = [["Row1Col1", "Row1Col2"], ["Row2Col1", "Row2Col2"]] with ( @@ -178,7 +178,7 @@ def test_console_print_table(): def test_console_print_benchmarks_metadata(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] with ( @@ -191,7 +191,7 @@ def test_console_print_benchmarks_metadata(): def test_console_print_benchmarks_info(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] with patch.object(console, "print_table") 
as mock_table: @@ -200,7 +200,7 @@ def test_console_print_benchmarks_info(): def test_console_print_benchmarks_stats(): - console = GenerativeBenchmarksConsole(enabled=True) + console = GenerativeBenchmarkerConsole() mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] with patch.object(console, "print_table") as mock_table: diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 511bacbf..d846767d 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -1,271 +1,152 @@ +"""Mock benchmark objects for unit testing.""" + +from guidellm.backend import GenerationRequestTimings from guidellm.benchmark import ( - BenchmarkArgs, BenchmarkSchedulerStats, GenerativeBenchmark, + GenerativeMetrics, GenerativeRequestStats, - GenerativeTextErrorStats, - SynchronousProfile, ) -from guidellm.objects import StatusBreakdown -from guidellm.request import GenerativeRequestLoaderDescription -from guidellm.scheduler import ( - GenerativeRequestsWorkerDescription, - SchedulerRequestInfo, - SynchronousStrategy, +from guidellm.benchmark.objects import BenchmarkerDict, SchedulerDict +from guidellm.benchmark.profile import SynchronousProfile +from guidellm.scheduler import ScheduledRequestInfo, SchedulerState, SynchronousStrategy +from guidellm.utils import ( + DistributionSummary, + Percentiles, + StandardBaseDict, + StatusBreakdown, + StatusDistributionSummary, ) __all__ = ["mock_generative_benchmark"] +def _create_mock_percentiles() -> Percentiles: + """Create mock percentiles for testing.""" + return Percentiles( + p001=0.1, + p01=1.0, + p05=5.0, + p10=10.0, + p25=25.0, + p50=50.0, + p75=75.0, + p90=90.0, + p95=95.0, + p99=99.0, + p999=99.9, + ) + + +def _create_mock_distribution() -> DistributionSummary: + """Create mock distribution summary for testing.""" + return DistributionSummary( + mean=50.0, + median=50.0, + mode=50.0, + variance=10.0, + std_dev=3.16, + min=10.0, + max=100.0, + count=100, + total_sum=5000.0, + percentiles=_create_mock_percentiles(), + ) + + +def _create_status_dist() -> StatusDistributionSummary: + """Create mock status distribution summary for testing.""" + dist = _create_mock_distribution() + return StatusDistributionSummary( + successful=dist, + incomplete=dist, + errored=dist, + total=dist, + ) + + def mock_generative_benchmark() -> GenerativeBenchmark: - return GenerativeBenchmark.from_stats( - run_id="fa4a92c1-9a1d-4c83-b237-83fcc7971bd3", - successful=[ - GenerativeRequestStats( - request_id="181a63e2-dc26-4268-9cfc-2ed9279aae63", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=True, - errored=False, - canceled=False, - targeted_start_time=1744728125.203447, - queued_time=1744728125.204123, - dequeued_time=1744728125.2048807, - scheduled_time=1744728125.2048993, - worker_start=1744728125.2049701, - request_start=1744728125.2052872, - request_end=1744728126.7004411, - worker_end=1744728126.701175, - process_id=0, - ), - prompt="such a sacrifice to her advantage as years of gratitude cannot enough acknowledge. By this time she is actually with them! If such goodness does not make her miserable now, she will never deserve to be happy! What a meeting for her, when she first sees my aunt! We must endeavour to forget all that has passed on either side, said Jane I hope and trust they will yet be happy. His consenting to marry her is a proof, I will believe, that he is come to a right way of thinking. 
Their mutual affection will steady them; and I flatter myself they will settle so quietly, and live in so rational a manner", # noqa: E501 - output=", as to make their long life together very comfortable and very useful. I feel, if they and the honourable Mr. Thorpe, who still lives amongst us, should be all I need, I could perfectly rest happy. Writes to meet them in that kind of obedience which is necessary and honourable, and such", # noqa: E501 - prompt_tokens=128, - output_tokens=64, - start_time=1744728125.2052872, - end_time=1744728126.7004411, - first_token_time=1744728125.2473357, - last_token_time=1744728126.699908, - ), - GenerativeRequestStats( - request_id="8a7846d5-7624-420d-a269-831e568a848f", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=True, - errored=False, - canceled=False, - targeted_start_time=1744728125.204613, - queued_time=1744728125.2047558, - dequeued_time=1744728126.7025175, - scheduled_time=1744728126.7025256, - worker_start=1744728126.702579, - request_start=1744728126.7027814, - request_end=1744728128.1961868, - worker_end=1744728128.196895, - process_id=0, - ), - prompt="a reconciliation; and, after a little further resistance on the part of his aunt, her resentment gave way, either to her affection for him, or her curiosity to see how his wife conducted herself; and she condescended to wait on them at Pemberley, in spite of that pollution which its woods had received, not merely from the presence of such a mistress, but the visits of her uncle and aunt from the city. With the Gardiners they were always on the most intimate terms. Darcy, as well as Elizabeth, really loved them; and they were both ever sensible of the warmest gratitude towards the persons who,", # noqa: E501 - output=" in their own days of poverty, had been so hotel and hospitable to a young couple leaving Pemberley. Till the size of Mr. Bennet\u2019s salary had been altered, the blessing of their friendship was much more greatly needed by the family than it appeared after that event.\n- Mr. Darcy soon deserved", # noqa: E501 - prompt_tokens=128, - output_tokens=64, - start_time=1744728126.7027814, - end_time=1744728128.1961868, - first_token_time=1744728126.7526379, - last_token_time=1744728128.1956792, - ), - GenerativeRequestStats( - request_id="4cde0e6c-4531-4e59-aac1-07bc8b6e4139", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=True, - errored=False, - canceled=False, - targeted_start_time=1744728126.7031465, - queued_time=1744728126.7034643, - dequeued_time=1744728128.198447, - scheduled_time=1744728128.1984534, - worker_start=1744728128.198509, - request_start=1744728128.1986883, - request_end=1744728129.6919055, - worker_end=1744728129.692606, - process_id=0, - ), - prompt="struck her, that _she_ was selected from among her sisters as worthy of being the mistress of Hunsford Parsonage, and of assisting to form a quadrille table at Rosings, in the absence of more eligible visitors. The idea soon reached to conviction, as she observed his increasing civilities towards herself, and heard his frequent attempt at a compliment on her wit and vivacity; and though more astonished than gratified herself by this effect of her charms, it was not long before her mother gave her to understand that the probability of their marriage was exceedingly agreeable to _her_. 
Elizabeth, however, did not choose", # noqa: E501 - output=" to improve this conversation into a prophecy, and her mother would hardly take on herself to announce so important a phenomenon. At last he was to drive to Hunsford from Meryton on Sunday; they staid for an hour at eight o'clock, and the following day appeared to be hung up on the walls of", # noqa: E501 - prompt_tokens=128, - output_tokens=64, - start_time=1744728128.1986883, - end_time=1744728129.6919055, - first_token_time=1744728128.2481627, - last_token_time=1744728129.6914039, - ), - GenerativeRequestStats( - request_id="a95b96be-05d4-4130-b0dd-9528c01c9909", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=True, - errored=False, - canceled=False, - targeted_start_time=1744728128.1987216, - queued_time=1744728128.1991177, - dequeued_time=1744728129.6953137, - scheduled_time=1744728129.695318, - worker_start=1744728129.695379, - request_start=1744728129.6955585, - request_end=1744728131.187553, - worker_end=1744728131.188169, - process_id=0, - ), - prompt="were comfortable on this subject. Day after day passed away without bringing any other tidings of him than the report which shortly prevailed in Meryton of his coming no more to Netherfield the whole winter; a report which highly incensed Mrs. Bennet, and which she never failed to contradict as a most scandalous falsehood. Even Elizabeth began to fear not that Bingley was indifferent but that his sisters would be successful in keeping him away. Unwilling as she was to admit an idea so destructive to Jane s happiness, and so dishonourable to the stability of her lover, she could not prevent its frequently recurring", # noqa: E501 - output=" during these indefinite disputes; and was often seriously engaged in blaming her sisters for increasing a suspense which might only be caused by their own inattention to a subject of so much moment. Whether she had really made that impression on the s+.ayers, or whether she had merely imagined it, she could decide no farther, for", # noqa: E501 - prompt_tokens=128, - output_tokens=64, - start_time=1744728129.6955585, - end_time=1744728131.187553, - first_token_time=1744728129.7438853, - last_token_time=1744728131.187019, - ), - GenerativeRequestStats( - request_id="714b751c-bbfe-4b2a-a0af-7c1bf2c224ae", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=True, - errored=False, - canceled=False, - targeted_start_time=1744728129.6975086, - queued_time=1744728129.6978767, - dequeued_time=1744728131.190093, - scheduled_time=1744728131.190101, - worker_start=1744728131.1901798, - request_start=1744728131.1904676, - request_end=1744728132.6833503, - worker_end=1744728132.6839745, - process_id=0, - ), - prompt="? cried Elizabeth, brightening up for a moment. Upon my word, said Mrs. Gardiner, I begin to be of your uncle s opinion. It is really too great a violation of decency, honour, and interest, for him to be guilty of it. I cannot think so very ill of Wickham. Can you, yourself, Lizzie, so wholly give him up, as to believe him capable of it? Not perhaps of neglecting his own interest. But of every other neglect I can believe him capable. If, indeed, it should be so! But I dare not hope it. Why should they not go on", # noqa: E501 - output=" together? This is still a motive incapable of being denied. He has such a faculty of pleasing, and you know how much she likes him. 
\nQuestion: What made elder sisters the center of their families?\nSometimes early this would be discussed in the family circle, but that was a very exceptional treatment.\nThank you,", # noqa: E501 - prompt_tokens=128, - output_tokens=64, - start_time=1744728131.1904676, - end_time=1744728132.6833503, - first_token_time=1744728131.2394557, - last_token_time=1744728132.6828275, - ), - GenerativeRequestStats( - request_id="ef73ae8a-4c8f-4c88-b303-cfff152ce378", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=True, - errored=False, - canceled=False, - targeted_start_time=1744728131.1891043, - queued_time=1744728131.1893764, - dequeued_time=1744728132.6859632, - scheduled_time=1744728132.6859682, - worker_start=1744728132.6860242, - request_start=1744728132.6862206, - request_end=1744728134.1805167, - worker_end=1744728134.1813161, - process_id=0, - ), - prompt="was. But her commendation, though costing her some trouble, could by no means satisfy Mr. Collins, and he was very soon obliged to take her Ladyship s praise into his own hands. Sir William stayed only a week at Hunsford; but his visit was long enough to convince him of his daughter s being most comfortably settled, and of her possessing such a husband and such a neighbour as were not often met with. While Sir William was with them, Mr. Collins devoted his mornings to driving him out in his gig, and showing him the country but when he went away, the whole family returned to their usual employments", # noqa: E501 - output=", and the sides of the family in which he was more particularly interested, to their respective places in the establishment. Here Jane was occasionally up as a substitute to her indolent sister, in her matron s stead, but was more frequently left idle, and with her hours of quietness, the unwelcome intrusion", # noqa: E501 - prompt_tokens=128, - output_tokens=64, - start_time=1744728132.6862206, - end_time=1744728134.1805167, - first_token_time=1744728132.7354612, - last_token_time=1744728134.1797993, - ), - ], - errored=[], - incomplete=[ - GenerativeTextErrorStats( - request_id="1b3def04-ca81-4f59-a56c-452a069d91af", - request_type="text_completions", - scheduler_info=SchedulerRequestInfo( - requested=True, - completed=False, - errored=True, - canceled=True, - targeted_start_time=1744728132.686177, - queued_time=1744728132.6866345, - dequeued_time=1744728134.1831052, - scheduled_time=1744728134.1831107, - worker_start=1744728134.183183, - request_start=1744728134.183544, - request_end=1744728135.2031732, - worker_end=1744728135.2033112, - process_id=0, - ), - prompt="is to tempt anyone to our humble abode. Our plain manner of living, our small rooms, and few domestics, and the little we see of the world, must make Hunsford extremely dull to a young lady like yourself; but I hope you will believe us grateful for the condescension, and that we have done everything in our power to prevent you spending your time unpleasantly. Elizabeth was eager with her thanks and assurances of happiness. She had spent six weeks with great enjoyment; and the pleasure of being with Charlotte, and the kind attention she had received, must make _her_ feel the obliged. Mr. 
Collins", # noqa: E501 - output=", who certainly had an eye to Elizabeth's manner, was glad _he was not to lose the curiosity she had given, and requested her away_ , _for the politeness of her conciliating manner would", # noqa: E501 - prompt_tokens=128, - output_tokens=43, - start_time=1744728134.183544, - end_time=1744728135.2031732, - first_token_time=1744728134.2323751, - last_token_time=1744728135.1950455, - error="TimeoutError: The request timed out before completing.", - ) - ], - args=BenchmarkArgs( - profile=SynchronousProfile(), - strategy_index=0, + """Create a minimal mock GenerativeBenchmark for testing purposes.""" + return GenerativeBenchmark( + run_id="test-run-gen", + run_index=0, + scheduler=SchedulerDict( strategy=SynchronousStrategy(), - max_number=None, - max_duration=10.0, - warmup_number=None, - warmup_duration=None, - cooldown_number=None, - cooldown_duration=None, + constraints={}, + state=SchedulerState(node_id=0, num_processes=1), ), + benchmarker=BenchmarkerDict( + profile=SynchronousProfile.create("synchronous", rate=None), + requests={}, + backend={}, + environment={}, + aggregators={}, + ), + env_args=StandardBaseDict(), + extras=StandardBaseDict(), run_stats=BenchmarkSchedulerStats( - start_time=1744728125.0772898, - end_time=1744728135.8407037, + start_time=1, + end_time=2, requests_made=StatusBreakdown( - successful=6, + successful=1, + incomplete=0, errored=0, - incomplete=1, - total=7, + total=1, ), - queued_time_avg=1.2821388585226876, - scheduled_time_delay_avg=7.96999250139509e-6, - scheduled_time_sleep_avg=0.0, - worker_start_delay_avg=6.399835859026228e-5, - worker_time_avg=1.4266603674207414, - worker_start_time_targeted_delay_avg=1.2825865745544434, - request_start_time_delay_avg=0.6414163964135307, - request_start_time_targeted_delay_avg=1.2827096836907523, - request_time_delay_avg=0.0004316908972603934, - request_time_avg=1.426228676523481, + queued_time_avg=0.1, + worker_resolve_start_delay_avg=0.1, + worker_resolve_time_avg=0.1, + worker_resolve_end_delay_avg=0.1, + finalized_delay_avg=0.1, + worker_targeted_start_delay_avg=0.1, + request_start_delay_avg=0.1, + request_time_avg=0.1, + request_targeted_delay_avg=0.1, + ), + start_time=1000.0, + end_time=2000.0, + metrics=GenerativeMetrics( + requests_per_second=_create_status_dist(), + request_concurrency=_create_status_dist(), + request_latency=_create_status_dist(), + prompt_token_count=_create_status_dist(), + output_token_count=_create_status_dist(), + total_token_count=_create_status_dist(), + time_to_first_token_ms=_create_status_dist(), + time_per_output_token_ms=_create_status_dist(), + inter_token_latency_ms=_create_status_dist(), + output_tokens_per_second=_create_status_dist(), + tokens_per_second=_create_status_dist(), ), - worker=GenerativeRequestsWorkerDescription( - backend_type="openai_http", - backend_target="http://localhost:8000", - backend_model="neuralmagic/Qwen2.5-7B-quantized.w8a8", - backend_info={ - "max_output_tokens": 16384, - "timeout": 300, - "http2": True, - "authorization": False, - "organization": None, - "project": None, - "text_completions_path": "/v1/completions", - "chat_completions_path": "/v1/chat/completions", - }, + request_totals=StatusBreakdown( + successful=1, + incomplete=0, + errored=0, + total=1, ), - requests_loader=GenerativeRequestLoaderDescription( - data='{"prompt_tokens": 128, "output_tokens": 64}', - data_args=None, - processor="neuralmagic/Qwen2.5-7B-quantized.w8a8", - processor_args=None, + requests=StatusBreakdown( + successful=[ + 
GenerativeRequestStats(
+                    scheduler_info=ScheduledRequestInfo(
+                        request_timings=GenerationRequestTimings(
+                            request_start=1,
+                            first_iteration=2,
+                            last_iteration=6,
+                            request_end=6,
+                        )
+                    ),
+                    request_id="a",
+                    request_type="text_completions",
+                    prompt="p",
+                    request_args={},
+                    output="o",
+                    iterations=1,
+                    prompt_tokens=1,
+                    output_tokens=2,
+                )
+            ],
+            incomplete=[],
+            errored=[],
+            total=None,
+        ),
+    )
diff --git a/tests/unit/objects/test_pydantic.py b/tests/unit/objects/test_pydantic.py
index b6c19a9a..515d95ab 100644
--- a/tests/unit/objects/test_pydantic.py
+++ b/tests/unit/objects/test_pydantic.py
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import computed_field
 
-from guidellm.utils.pydantic import StandardBaseModel
+from guidellm.utils.pydantic_utils import StandardBaseModel
 
 
 class ExampleModel(StandardBaseModel):
diff --git a/tests/unit/objects/test_statistics.py b/tests/unit/objects/test_statistics.py
index fa8cccd0..855bfa5f 100644
--- a/tests/unit/objects/test_statistics.py
+++ b/tests/unit/objects/test_statistics.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pytest
 
-from guidellm.objects import (
+from guidellm.utils import (
     DistributionSummary,
     Percentiles,
     RunningStats,
diff --git a/tox.ini b/tox.ini
index 08fc27b9..4e2fde9f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -35,6 +35,14 @@
 commands =
     python -m pytest tests/e2e {posargs}
 
+[testenv:test-paths]
+description = Run provided paths tests
+deps =
+    .[dev]
+commands =
+    python -m pytest {posargs}
+
+
 [testenv:quality]
 description = Run all quality checks
 deps =
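
For reference, a typical local workflow with the environments introduced above might look like the following sketch. It assumes the repository root as the working directory and that the simulator binary lands in `./bin` as described in `tests/e2e/README.md`; the specific path passed to `test-paths` is illustrative only.

```shell
# Build the vLLM simulator binary into ./bin (one-time setup for E2E tests)
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./

# Run the full E2E suite
tox -e test-e2e

# Run an arbitrary test path via the new test-paths environment,
# which forwards its positional arguments straight to pytest
tox -e test-paths -- tests/e2e/test_successful_benchmark.py
```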