diff --git a/README.md b/README.md
index 78933336..8b06f1c3 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inference Needs
 
-[![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPi Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPi%20Release)](https://pypi.python.org/pypi/guidellm) [![Pypi Release](https://img.shields.io/pypi/v/guidellm-nightly.svg?label=PyPi%20Nightly)](https://pypi.python.org/pypi/guidellm-nightly) [![Python Versions](https://img.shields.io/pypi/pyversions/guidellm.svg?label=Python)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
+[![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPI Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPI%20Release)](https://pypi.python.org/pypi/guidellm) [![PyPI Nightly](https://img.shields.io/pypi/v/guidellm-nightly.svg?label=PyPI%20Nightly)](https://pypi.python.org/pypi/guidellm-nightly) [![Python Versions](https://img.shields.io/pypi/pyversions/guidellm.svg?label=Python)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
 
 ## Overview
 
@@ -65,10 +65,12 @@ To run a GuideLLM evaluation, use the `guidellm` command with the appropriate mo
 ```bash
 guidellm \
   --target "http://localhost:8000/v1" \
-  --model "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
+  --model "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16" \
+  --data-type emulated \
+  --data "prompt_tokens=512,generated_tokens=128"
 ```
 
-The above command will begin the evaluation and output progress updates similar to the following:
+The above command will begin the evaluation and output progress updates similar to the following (if you are running against a different server, be sure to update the target):
 
 Notes:
 
@@ -88,17 +90,39 @@ The end of the output will include important performance summary metrics such as
 Sample GuideLLM benchmark end output
 
-### Advanced Settings
+### Configurations
 
-GuideLLM provides various options to customize evaluations, including setting the duration of each benchmark run, the number of concurrent requests, and the request rate. For a complete list of options and advanced settings, see the [GuideLLM CLI Documentation](https://github.com/neuralmagic/guidellm/blob/main/docs/guides/cli.md).
+GuideLLM provides various CLI and environment options to customize evaluations, including setting the duration of each benchmark run, the number of concurrent requests, and the request rate.
 
-Some common advanced settings include:
+Some common configurations for the CLI include:
 
-- `--rate-type`: The rate to use for benchmarking. Options include `sweep` (shown above), `synchronous` (one request at a time), `throughput` (all requests at once), `constant` (a constant rate defined by `--rate`), and `poisson` (a poisson distribution rate defined by `--rate`).
-- `--data-type`: The data to use for the benchmark. Options include `emulated` (default shown above, emulated to match a given prompt and output length), `transformers` (a transformers dataset), and `file` (a {text, json, jsonl, csv} file with a list of prompts).
+- `--rate-type`: The type of request rate to use for benchmarking. Options include `sweep`, `synchronous`, `throughput`, `constant`, and `poisson`.
+  - `--rate-type sweep`: (default) Sweeps through the full performance range of the server, starting with a `synchronous` run, then `throughput`, and finally 10 `constant` rates between the minimum and maximum request rates found.
+  - `--rate-type synchronous`: Sends requests one at a time, waiting for each to complete before sending the next.
+  - `--rate-type throughput`: Sends requests as fast as possible to measure the server's maximum throughput.
+  - `--rate-type constant`: Sends requests at a fixed rate. Specify the rate in requests per second with the `--rate` argument, for example `--rate 10`, or multiple rates with `--rate 10 --rate 20 --rate 30`.
+  - `--rate-type poisson`: Draws request times from a Poisson distribution with the mean at the specified rate, adding real-world variance to the runs. Specify the rate in requests per second with the `--rate` argument, for example `--rate 10`, or multiple rates with `--rate 10 --rate 20 --rate 30`.
+- `--data-type`: The type of data to use for the benchmark. Options include `emulated`, `transformers`, and `file`.
+  - `--data-type emulated`: Accepts an `EmulatedConfig` as a string or file for the `--data` argument and generates synthetic data from it. Specify the number of prompt tokens at a minimum, and optionally the number of generated tokens and other parameters to add variance to the lengths. For example, `--data "prompt_tokens=128"`, `--data "prompt_tokens=128,generated_tokens=128"`, or `--data "prompt_tokens=128,prompt_tokens_variance=10"`.
+  - `--data-type file`: Accepts a file path or URL for the `--data` argument. The file must be CSV, JSONL, TXT, JSON, or YAML, with a single prompt per line for CSV, JSONL, and TXT, or a list of prompts for JSON and YAML. For example, `--data "data.txt"` where the contents of data.txt are `"prompt1\nprompt2\nprompt3"`.
+  - `--data-type transformers`: Accepts a Hugging Face dataset name or dataset file path for the `--data` argument. For example, `--data "neuralmagic/LLM_compression_calibration"`.
 - `--max-seconds`: The maximum number of seconds to run each benchmark. The default is 120 seconds.
 - `--max-requests`: The maximum number of requests to run in each benchmark.
 
+For a full list of supported CLI arguments, run the following command:
+
+```bash
+guidellm --help
+```
+
+For a full list of environment configuration options, run the following command:
+
+```bash
+guidellm-config
+```
+
+For further information, see the [GuideLLM Documentation](#documentation).
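As an example of combining these options, a fixed-rate run against a local prompt file might look like the following sketch (the server URL, model name, and `prompts.txt` file are illustrative placeholders; every flag shown is described above or in `guidellm --help`):

```bash
guidellm \
  --target "http://localhost:8000/v1" \
  --model "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16" \
  --data-type file \
  --data "prompts.txt" \
  --rate-type constant \
  --rate 10 \
  --max-seconds 60
```

Here `--rate 10` holds the load at roughly 10 requests per second for up to 60 seconds, which is useful for checking latency at a known traffic level before running a full `sweep`.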
+ ## Resources ### Documentation @@ -109,7 +133,7 @@ Our comprehensive documentation provides detailed guides and resources to help y - [**Installation Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md) - Step-by-step instructions to install GuideLLM, including prerequisites and setup tips. - [**Architecture Overview**](https://github.com/neuralmagic/guidellm/tree/main/docs/architecture.md) - A detailed look at GuideLLM's design, components, and how they interact. -- [**CLI Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/cli_usage.md) - Comprehensive usage information for running GuideLLM via the command line, including available commands and options. +- [**CLI Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/cli.md) - Comprehensive usage information for running GuideLLM via the command line, including available commands and options. - [**Configuration Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/configuration.md) - Instructions on configuring GuideLLM to suit various deployment needs and performance goals. ### Supporting External Documentation diff --git a/docs/assets/sample-benchmark.gif b/docs/assets/sample-benchmark.gif deleted file mode 100644 index f0c86146..00000000 Binary files a/docs/assets/sample-benchmark.gif and /dev/null differ diff --git a/docs/assets/sample-benchmarks.gif b/docs/assets/sample-benchmarks.gif new file mode 100644 index 00000000..160763b7 Binary files /dev/null and b/docs/assets/sample-benchmarks.gif differ diff --git a/pyproject.toml b/pyproject.toml index dcd14961..db2e65a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,7 @@ dev = [ [project.entry-points.console_scripts] guidellm = "guidellm.main:generate_benchmark_report_cli" +guidellm-config = "guidellm.config:print_config" # ************************************************ diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py index c2432b20..d71c5f66 100644 --- a/src/guidellm/backend/base.py +++ b/src/guidellm/backend/base.py @@ -1,9 +1,14 @@ +import asyncio import functools from abc import ABC, abstractmethod from typing import AsyncGenerator, Dict, List, Literal, Optional, Type, Union from loguru import logger from pydantic import BaseModel +from transformers import ( # type: ignore # noqa: PGH003 + AutoTokenizer, + PreTrainedTokenizer, +) from guidellm.core import TextGenerationRequest, TextGenerationResult @@ -103,10 +108,21 @@ def create(cls, backend_type: BackendEngine, **kwargs) -> "Backend": return Backend._registry[backend_type](**kwargs) def __init__(self, type_: BackendEngine, target: str, model: str): + """ + Base constructor for the Backend class. + Calls into test_connection to ensure the backend is reachable. + Ensure all setup is done in the subclass constructor before calling super. + + :param type_: The type of the backend. + :param target: The target URL for the backend. + :param model: The model used by the backend. + """ self._type = type_ self._target = target self._model = model + self.test_connection() + @property def default_model(self) -> str: """ @@ -148,6 +164,48 @@ def model(self) -> str: """ return self._model + def model_tokenizer(self) -> PreTrainedTokenizer: + """ + Get the tokenizer for the backend model. + + :return: The tokenizer instance. + """ + return AutoTokenizer.from_pretrained(self.model) + + def test_connection(self) -> bool: + """ + Test the connection to the backend by running a short text generation request. 
+ If successful, returns True, otherwise raises an exception. + + :return: True if the connection is successful. + :rtype: bool + :raises ValueError: If the connection test fails. + """ + try: + asyncio.get_running_loop() + is_async = True + except RuntimeError: + is_async = False + + if is_async: + logger.warning("Running in async mode, cannot test connection") + return True + + try: + request = TextGenerationRequest( + prompt="Test connection", output_token_count=5 + ) + + asyncio.run(self.submit(request)) + return True + except Exception as err: + raise_err = RuntimeError( + f"Backend connection test failed for backend type={self.type_} " + f"with target={self.target} and model={self.model} with error: {err}" + ) + logger.error(raise_err) + raise raise_err from err + async def submit(self, request: TextGenerationRequest) -> TextGenerationResult: """ Submit a text generation request and return the result. diff --git a/src/guidellm/config.py b/src/guidellm/config.py index 8d16cd60..c3d950ec 100644 --- a/src/guidellm/config.py +++ b/src/guidellm/config.py @@ -1,5 +1,6 @@ +import json from enum import Enum -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence from pydantic import BaseModel, Field, model_validator from pydantic_settings import BaseSettings, SettingsConfigDict @@ -10,6 +11,7 @@ "Environment", "LoggingSettings", "OpenAISettings", + "print_config", "ReportGenerationSettings", "Settings", "reload_settings", @@ -70,7 +72,6 @@ class DatasetSettings(BaseModel): preferred_data_splits: List[str] = Field( default_factory=lambda: ["test", "tst", "validation", "val", "train"] ) - default_tokenizer: str = "neuralmagic/Meta-Llama-3.1-8B-FP8" class EmulatedDataSettings(BaseModel): @@ -163,6 +164,53 @@ def set_default_source(cls, values): return values + def generate_env_file(self) -> str: + """ + Generate the .env file from the current settings + """ + return Settings._recursive_generate_env( + self, + self.model_config["env_prefix"], # type: ignore # noqa: PGH003 + self.model_config["env_nested_delimiter"], # type: ignore # noqa: PGH003 + ) + + @staticmethod + def _recursive_generate_env(model: BaseModel, prefix: str, delimiter: str) -> str: + env_file = "" + add_models = [] + for key, value in model.model_dump().items(): + if isinstance(value, BaseModel): + # add nested properties to be processed after the current level + add_models.append((key, value)) + continue + + dict_values = ( + { + f"{prefix}{key.upper()}{delimiter}{sub_key.upper()}": sub_value + for sub_key, sub_value in value.items() + } + if isinstance(value, dict) + else {f"{prefix}{key.upper()}": value} + ) + + for tag, sub_value in dict_values.items(): + if isinstance(sub_value, Sequence) and not isinstance(sub_value, str): + value_str = ",".join(f'"{item}"' for item in sub_value) + env_file += f"{tag}=[{value_str}]\n" + elif isinstance(sub_value, Dict): + value_str = json.dumps(sub_value) + env_file += f"{tag}={value_str}\n" + elif not sub_value: + env_file += f"{tag}=\n" + else: + env_file += f'{tag}="{sub_value}"\n' + + for key, value in add_models: + env_file += Settings._recursive_generate_env( + value, f"{prefix}{key.upper()}{delimiter}", delimiter + ) + return env_file + settings = Settings() @@ -173,3 +221,14 @@ def reload_settings(): """ new_settings = Settings() settings.__dict__.update(new_settings.__dict__) + + +def print_config(): + """ + Print the current configuration settings + """ + print(f"Settings: \n{settings.generate_env_file()}") # noqa: T201 + + +if __name__ == 
"__main__": + print_config() diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 133d12ee..4f7315c5 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -28,3 +28,17 @@ class TextGenerationRequest(Serializable): default_factory=dict, description="The parameters for the text generation request.", ) + + def __str__(self) -> str: + prompt_short = ( + self.prompt[:32] + "..." + if self.prompt and len(self.prompt) > 32 # noqa: PLR2004 + else self.prompt + ) + + return ( + f"TextGenerationRequest(id={self.id}, " + f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, " + f"output_token_count={self.output_token_count}, " + f"params={self.params})" + ) diff --git a/src/guidellm/main.py b/src/guidellm/main.py index 2f7f9c44..c0bea832 100644 --- a/src/guidellm/main.py +++ b/src/guidellm/main.py @@ -22,6 +22,7 @@ @click.option( "--target", type=str, + required=True, help=( "The target path or url for the backend to evaluate. " "Ex: 'http://localhost:8000/v1'" @@ -33,8 +34,8 @@ default="openai_server", help=( "The backend to use for benchmarking. " - "The default is OpenAI Server enabling compatiability with any server that " - "follows the OpenAI spec including vLLM" + "The default is OpenAI Server enabling compatability with any server that " + "follows the OpenAI spec including vLLM." ), ) @click.option( @@ -49,18 +50,25 @@ @click.option( "--data", type=str, - default=None, + required=True, help=( - "The data source to use for benchmarking. Depending on the data-type, " - "it should be a path to a file, a dataset name, " - "or a configuration for emulated data." + "The data source to use for benchmarking. " + "Depending on the data-type, it should be a " + "path to a data file containing prompts to run (ex: data.txt), " + "a HuggingFace dataset name (ex: 'neuralmagic/LLM_compression_calibration'), " + "or a configuration for emulated data " + "(ex: 'prompt_tokens=128,generated_tokens=128')." ), ) @click.option( "--data-type", type=click.Choice(["emulated", "file", "transformers"]), - default="emulated", - help="The type of data given for benchmarking", + required=True, + help=( + "The type of data to use for benchmarking. " + "Use 'emulated' for synthetic data, 'file' for a file, or 'transformers' " + "for a HuggingFace dataset. Specify the data source with the --data flag." + ), ) @click.option( "--tokenizer", @@ -68,7 +76,10 @@ default=None, help=( "The tokenizer to use for calculating the number of prompt tokens. " - "If not provided, will use a Llama 3.1 tokenizer." + "This should match the tokenizer used by the model." + "By default, it will use the --model flag to determine the tokenizer. " + "If not provided and the model is not available, will raise an error. " + "Ex: 'neuralmagic/Meta-Llama-3.1-8B-quantized.w8a8'" ), ) @click.option( @@ -77,15 +88,21 @@ default="sweep", help=( "The type of request rate to use for benchmarking. " - "The default is sweep, which will run a synchronous and throughput " - "rate-type and then interfill with constant rates between the two values" + "Use sweep to run a full range from synchronous to throughput (default), " + "synchronous for sending requests one after the other, " + "throughput to send requests as fast as possible, " + "constant for a fixed request rate, " + "or poisson for a real-world variable request rate." 
), ) @click.option( "--rate", type=float, default=None, - help="The request rate to use for constant and poisson rate types", + help=( + "The request rate to use for constant and poisson rate types. " + "To run multiple, provide the flag multiple times. " + ), multiple=True, ) @click.option( @@ -117,18 +134,20 @@ @click.option( "--output-path", type=str, - default="guidance_report.json", + default=None, help=( - "The output path to save the output report to. " - "The default is guidance_report.json." + "The output path to save the output report to for loading later. " + "Ex: guidance_report.json. " + "The default is None, meaning no output is saved and results are only " + "printed to the console." ), ) @click.option( - "--disable-continuous-refresh", + "--enable-continuous-refresh", is_flag=True, default=False, help=( - "Disable continual refreshing of the output table in the CLI " + "Enable continual refreshing of the output table in the CLI " "until the user exits. " ), ) @@ -144,7 +163,7 @@ def generate_benchmark_report_cli( max_seconds: Optional[int], max_requests: Optional[int], output_path: str, - disable_continuous_refresh: bool, + enable_continuous_refresh: bool, ): """ Generate a benchmark report for a specified backend and dataset. @@ -161,7 +180,7 @@ def generate_benchmark_report_cli( max_seconds=max_seconds, max_requests=max_requests, output_path=output_path, - cont_refresh_table=not disable_continuous_refresh, + cont_refresh_table=enable_continuous_refresh, ) @@ -177,7 +196,7 @@ def generate_benchmark_report( max_seconds: Optional[int], max_requests: Optional[int], output_path: str, - cont_refresh_table: bool = True, + cont_refresh_table: bool, ) -> GuidanceReport: """ Generate a benchmark report for a specified backend and dataset. @@ -213,14 +232,26 @@ def generate_benchmark_report( request_generator: RequestGenerator - # Create request generator + # Create tokenizer and request generator + tokenizer_inst = tokenizer + if not tokenizer_inst: + try: + tokenizer_inst = backend_inst.model_tokenizer() + except Exception as err: + raise ValueError( + "Could not load model's tokenizer, " + "--tokenizer must be provided for request generation" + ) from err + if data_type == "emulated": - request_generator = EmulatedRequestGenerator(config=data, tokenizer=tokenizer) + request_generator = EmulatedRequestGenerator( + config=data, tokenizer=tokenizer_inst + ) elif data_type == "file": - request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer) + request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer_inst) elif data_type == "transformers": request_generator = TransformersDatasetRequestGenerator( - dataset=data, tokenizer=tokenizer + dataset=data, tokenizer=tokenizer_inst ) else: raise ValueError(f"Unknown data type: {data_type}") @@ -252,8 +283,14 @@ def generate_benchmark_report( # Save and print report guidance_report = GuidanceReport() guidance_report.benchmarks.append(report) - guidance_report.save_file(output_path) - guidance_report.print(output_path, continual_refresh=cont_refresh_table) + + if output_path: + guidance_report.save_file(output_path) + + guidance_report.print( + save_path=output_path if output_path is not None else "stdout", + continual_refresh=cont_refresh_table, + ) return guidance_report diff --git a/src/guidellm/request/base.py b/src/guidellm/request/base.py index 52935b76..242bf894 100644 --- a/src/guidellm/request/base.py +++ b/src/guidellm/request/base.py @@ -3,7 +3,7 @@ import time from abc import ABC, abstractmethod from 
queue import Empty, Full, Queue -from typing import Iterator, Literal, Optional, Union +from typing import Iterator, Literal, Union from loguru import logger from transformers import ( # type: ignore # noqa: PGH003 @@ -11,7 +11,6 @@ PreTrainedTokenizer, ) -from guidellm.config import settings from guidellm.core.request import TextGenerationRequest __all__ = ["GenerationMode", "RequestGenerator"] @@ -41,7 +40,7 @@ def __init__( self, type_: str, source: str, - tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + tokenizer: Union[str, PreTrainedTokenizer], mode: GenerationMode = "async", async_queue_size: int = 50, ): @@ -53,19 +52,16 @@ def __init__( self._stop_event: threading.Event = threading.Event() if not tokenizer: - self._tokenizer = AutoTokenizer.from_pretrained( - settings.dataset.default_tokenizer - ) - logger.info("Initialized fake tokenizer for request generation") - else: - self._tokenizer = ( - AutoTokenizer.from_pretrained(tokenizer) - if isinstance(tokenizer, str) - else tokenizer - ) - logger.info( - "Tokenizer initialized for request generation: {}", self._tokenizer - ) + err = "Tokenizer must be provided for request generation" + logger.error(err) + raise ValueError(err) + + self._tokenizer = ( + AutoTokenizer.from_pretrained(tokenizer) + if isinstance(tokenizer, str) + else tokenizer + ) + logger.info("Tokenizer initialized for request generation: {}", self._tokenizer) if self._mode == "async": self._thread = threading.Thread(target=self._populate_queue, daemon=True) diff --git a/tests/unit/backend/test_base.py b/tests/unit/backend/test_base.py index 9247eb1e..edd61a90 100644 --- a/tests/unit/backend/test_base.py +++ b/tests/unit/backend/test_base.py @@ -10,6 +10,9 @@ class MockBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): yield GenerativeResponse(type_="final", output="Test") @@ -48,6 +51,9 @@ class MockBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): yield GenerativeResponse( type_="token_iter", @@ -91,6 +97,9 @@ class MockBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): yield GenerativeResponse(type_="final", output="Test") @@ -110,6 +119,9 @@ class MockBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): yield GenerativeResponse(type_="token_iter", add_token="Token") yield GenerativeResponse(type_="token_iter", add_token=" ") @@ -132,6 +144,9 @@ class MockBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): if False: # simulate no yield yield @@ -152,6 +167,9 @@ class MockBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): yield GenerativeResponse(type_="token_iter", add_token="Token") yield GenerativeResponse(type_="token_iter", add_token=" ") @@ -174,6 +192,9 @@ class MockBackend(Backend): def __init__(self): 
super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + def available_models(self): return ["mock-model", "mock-model-2"] @@ -185,6 +206,39 @@ async def make_request(self, request): assert backend.default_model == "mock-model" +@pytest.mark.smoke() +def test_backend_test_connection(): + class MockBackend(Backend): + def __init__(self): + super().__init__("test", "http://localhost:8000", "mock-model") + + def available_models(self): + return ["mock-model", "mock-model-2"] + + async def make_request(self, request): + yield GenerativeResponse(type_="final", output="") + + assert MockBackend().test_connection() + + +@pytest.mark.smoke() +def test_backend_tokenizer(mock_auto_tokenizer): + class MockBackend(Backend): + def __init__(self): + super().__init__("test", "http://localhost:8000", "mock-model") + + def available_models(self): + return ["mock-model", "mock-model-2"] + + async def make_request(self, request): + yield GenerativeResponse(type_="final", output="") + + backend = MockBackend() + tokenizer = backend.model_tokenizer() + assert tokenizer is not None + assert tokenizer.tokenize("text") is not None + + @pytest.mark.regression() def test_backend_abstract_methods(): with pytest.raises(TypeError): @@ -194,6 +248,9 @@ class IncompleteBackend(Backend): def __init__(self): super().__init__("test", "http://localhost:8000", "mock-model") + def test_connection(self) -> bool: + return True + async def make_request(self, request): yield GenerativeResponse(type_="final", output="Test") diff --git a/tests/unit/request/test_base.py b/tests/unit/request/test_base.py index 8b75be10..73cf1b14 100644 --- a/tests/unit/request/test_base.py +++ b/tests/unit/request/test_base.py @@ -11,14 +11,16 @@ @pytest.mark.smoke() def test_request_generator_sync_constructor(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="sync") + generator = TestRequestGenerator(mode="sync", tokenizer="mock-tokenizer") assert generator.mode == "sync" assert generator.async_queue_size == 50 # Default value @pytest.mark.smoke() def test_request_generator_async_constructor(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="async", async_queue_size=10) + generator = TestRequestGenerator( + mode="async", tokenizer="mock-tokenizer", async_queue_size=10 + ) assert generator.mode == "async" assert generator.async_queue_size == 10 generator.stop() @@ -26,7 +28,7 @@ def test_request_generator_async_constructor(mock_auto_tokenizer): @pytest.mark.smoke() def test_request_generator_sync_iter(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="sync") + generator = TestRequestGenerator(mode="sync", tokenizer="mock-tokenizer") items = [] for item in generator: items.append(item) @@ -39,7 +41,7 @@ def test_request_generator_sync_iter(mock_auto_tokenizer): @pytest.mark.smoke() def test_request_generator_async_iter(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="async") + generator = TestRequestGenerator(mode="async", tokenizer="mock-tokenizer") items = [] for item in generator: items.append(item) @@ -53,7 +55,7 @@ def test_request_generator_async_iter(mock_auto_tokenizer): @pytest.mark.smoke() def test_request_generator_iter_calls_create_item(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="sync") + generator = TestRequestGenerator(mode="sync", tokenizer="mock-tokenizer") generator.create_item = Mock( # type: ignore return_value=TextGenerationRequest(prompt="Mock prompt"), ) @@ -70,7 +72,7 @@ def 
test_request_generator_iter_calls_create_item(mock_auto_tokenizer): @pytest.mark.smoke() def test_request_generator_async_iter_calls_create_item(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="sync") + generator = TestRequestGenerator(mode="sync", tokenizer="mock-tokenizer") generator.create_item = Mock( # type: ignore return_value=TextGenerationRequest(prompt="Mock prompt"), ) @@ -88,7 +90,9 @@ def test_request_generator_async_iter_calls_create_item(mock_auto_tokenizer): @pytest.mark.sanity() def test_request_generator_repr(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="sync", async_queue_size=100) + generator = TestRequestGenerator( + mode="sync", tokenizer="mock-tokenizer", async_queue_size=100 + ) repr_str = repr(generator) assert repr_str.startswith("RequestGenerator(") assert "mode=sync" in repr_str @@ -98,7 +102,7 @@ def test_request_generator_repr(mock_auto_tokenizer): @pytest.mark.sanity() def test_request_generator_stop(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="async") + generator = TestRequestGenerator(mode="async", tokenizer="mock-tokenizer") generator.stop() assert generator._stop_event.is_set() assert not generator._thread.is_alive() @@ -127,7 +131,9 @@ def _fake_tokenize(text: str) -> List[int]: @pytest.mark.regression() def test_request_generator_populate_queue(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="async", async_queue_size=2) + generator = TestRequestGenerator( + mode="async", tokenizer="mock-tokenizer", async_queue_size=2 + ) generator.create_item = Mock( # type: ignore return_value=TextGenerationRequest(prompt="Mock prompt") ) @@ -139,7 +145,9 @@ def test_request_generator_populate_queue(mock_auto_tokenizer): @pytest.mark.regression() def test_request_generator_async_stop_during_population(mock_auto_tokenizer): - generator = TestRequestGenerator(mode="async", async_queue_size=2) + generator = TestRequestGenerator( + mode="async", tokenizer="mock-tokenizer", async_queue_size=2 + ) generator.create_item = Mock( # type: ignore return_value=TextGenerationRequest(prompt="Mock prompt") ) diff --git a/tests/unit/request/test_emulated.py b/tests/unit/request/test_emulated.py index 699b1d6f..f6af1301 100644 --- a/tests/unit/request/test_emulated.py +++ b/tests/unit/request/test_emulated.py @@ -237,7 +237,7 @@ def test_endless_data_words_create_text(data, start, length, expected_text): @pytest.mark.smoke() -def test_emulated_request_generator_construction(mocker): +def test_emulated_request_generator_construction(mocker, mock_auto_tokenizer): mocker.patch( "guidellm.request.emulated.EmulatedConfig.create_config", return_value=EmulatedConfig(prompt_tokens=10), @@ -246,7 +246,9 @@ def test_emulated_request_generator_construction(mocker): "guidellm.request.emulated.EndlessTokens", return_value=EndlessTokens("word1 word2"), ) - generator = EmulatedRequestGenerator(config="mock_config", mode="sync") + generator = EmulatedRequestGenerator( + config="mock_config", tokenizer="mock-tokenizer", mode="sync" + ) assert isinstance(generator._config, EmulatedConfig) assert isinstance(generator._tokens, EndlessTokens) @@ -276,7 +278,9 @@ def test_emulated_request_generator_sample_prompt(mocker, mock_auto_tokenizer): "guidellm.request.emulated.EndlessTokens", return_value=EndlessTokens("word1 word2"), ) - generator = EmulatedRequestGenerator(config={"prompt_tokens": 3}, mode="sync") + generator = EmulatedRequestGenerator( + config={"prompt_tokens": 3}, tokenizer="mock-tokenizer", mode="sync" + ) prompt = 
generator.sample_prompt(3) assert prompt == "word1 word2 word1" @@ -285,7 +289,7 @@ def test_emulated_request_generator_sample_prompt(mocker, mock_auto_tokenizer): @pytest.mark.smoke() -def test_emulated_request_generator_random_seed(mocker): +def test_emulated_request_generator_random_seed(mocker, mock_auto_tokenizer): mocker.patch( "guidellm.request.emulated.EndlessTokens", return_value=EndlessTokens("word1 word2"), @@ -293,16 +297,19 @@ def test_emulated_request_generator_random_seed(mocker): rand_gen = EmulatedRequestGenerator( config={"prompt_tokens": 20, "prompt_tokens_variance": 10}, + tokenizer="mock-tokenizer", random_seed=42, mode="sync", ) rand_gen_comp_pos = EmulatedRequestGenerator( config={"prompt_tokens": 20, "prompt_tokens_variance": 10}, + tokenizer="mock-tokenizer", random_seed=42, mode="sync", ) rand_gen_comp_neg = EmulatedRequestGenerator( config={"prompt_tokens": 20, "prompt_tokens_variance": 10}, + tokenizer="mock-tokenizer", random_seed=43, mode="sync", ) @@ -339,13 +346,14 @@ def test_emulated_request_generator_lifecycle( config: Union[str, dict, Path], ): if config_type in ["dict", "json_str", "key_value_str"]: - generator = EmulatedRequestGenerator(config) + generator = EmulatedRequestGenerator(config, tokenizer="mock-tokenizer") elif config_type in ["file_str", "file_path"]: with tempfile.TemporaryDirectory() as temp_dir: file_path = Path(temp_dir) / "test.json" file_path.write_text(config) # type: ignore generator = EmulatedRequestGenerator( - str(file_path) if config_type == "file_str" else file_path + str(file_path) if config_type == "file_str" else file_path, + tokenizer="mock-tokenizer", ) for _ in range(5): diff --git a/tests/unit/request/test_file.py b/tests/unit/request/test_file.py index b214e410..429de523 100644 --- a/tests/unit/request/test_file.py +++ b/tests/unit/request/test_file.py @@ -12,7 +12,7 @@ def test_file_request_generator_constructor(mock_auto_tokenizer): with tempfile.TemporaryDirectory() as temp_dir: file_path = Path(temp_dir) / "example.txt" file_path.write_text("This is a test.\nThis is another test.") - generator = FileRequestGenerator(file_path) + generator = FileRequestGenerator(file_path, tokenizer="mock-tokenizer") assert generator._path == file_path assert generator._data == ["This is a test.", "This is another test."] assert generator._iterator is not None @@ -23,7 +23,9 @@ def test_file_request_generator_create_item(mock_auto_tokenizer): with tempfile.TemporaryDirectory() as temp_dir: file_path = Path(temp_dir) / "example.txt" file_path.write_text("This is a test.\nThis is another test.") - generator = FileRequestGenerator(file_path, mode="sync") + generator = FileRequestGenerator( + file_path, tokenizer="mock-tokenizer", mode="sync" + ) request = generator.create_item() assert isinstance(request, TextGenerationRequest) assert request.prompt == "This is a test." 
@@ -87,7 +89,7 @@ def test_file_request_generator_file_types_lifecycle( with tempfile.TemporaryDirectory() as temp_dir: file_path = Path(temp_dir) / f"example.{file_extension}" file_path.write_text(file_content) - generator = FileRequestGenerator(file_path) + generator = FileRequestGenerator(file_path, tokenizer="mock-tokenizer") for index, request in enumerate(generator): assert isinstance(request, TextGenerationRequest) diff --git a/tests/unit/request/test_transformers.py b/tests/unit/request/test_transformers.py index fcf933be..415fed65 100644 --- a/tests/unit/request/test_transformers.py +++ b/tests/unit/request/test_transformers.py @@ -28,6 +28,7 @@ def test_transformers_dataset_request_generator_constructor( dataset="dummy_dataset", split="train", column="text", + tokenizer="mock-tokenizer", ) assert generator._dataset == "dummy_dataset" assert generator._split == "train" @@ -45,6 +46,7 @@ def test_transformers_dataset_request_generator_create_item( dataset=create_sample_dataset_dict(), split="train", column="text", + tokenizer="mock-tokenizer", mode="sync", ) request = generator.create_item() @@ -83,7 +85,7 @@ def test_transformers_dataset_request_generator_lifecycle( return_value=dataset, ): generator = TransformersDatasetRequestGenerator( - dataset=dataset_arg, mode="sync" + dataset=dataset_arg, tokenizer="mock-tokenizer", mode="sync" ) for index, request in enumerate(generator): diff --git a/tests/unit/test_main.py b/tests/unit/test_main.py index 0ecc978c..6923092e 100644 --- a/tests/unit/test_main.py +++ b/tests/unit/test_main.py @@ -232,6 +232,8 @@ def test_generate_benchmark_report_cli_smoke( "openai_server", "--data-type", "emulated", + "--data", + "prompt_tokens=512", "--rate-type", "sweep", "--max-seconds", @@ -240,10 +242,12 @@ def test_generate_benchmark_report_cli_smoke( "10", "--output-path", "benchmark_report.json", - "--disable-continuous-refresh", ], ) + if result.stdout: + print(result.stdout) # noqa: T201 + assert result.exit_code == 0 assert "Benchmarks" in result.output assert "Generating report..." in result.output
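Putting the CLI changes together, an invocation equivalent to the updated smoke test above might look like the following sketch (the target URL and output path are illustrative; `--data` and `--data-type` are now required, and table refreshing is opt-in via `--enable-continuous-refresh`):

```bash
guidellm \
  --target "http://localhost:8000/v1" \
  --backend openai_server \
  --data-type emulated \
  --data "prompt_tokens=512" \
  --rate-type sweep \
  --max-seconds 10 \
  --max-requests 10 \
  --output-path benchmark_report.json
```

Because `--output-path` now defaults to `None`, omitting it prints results to the console without saving a report file.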