Commit da02ee8
[GuideLLM Refactor] mock server package creation (#357)
## **Summary**

Introduces a comprehensive mock server implementation that simulates OpenAI and vLLM APIs with configurable timing characteristics and response patterns. The mock server enables realistic performance testing and validation of GuideLLM benchmarking workflows without requiring actual model deployments, supporting both streaming and non-streaming endpoints with proper token counting, latency simulation (TTFT/ITL), and error handling.

## **Details**

- Added `mock_server` package with modular architecture including configuration, handlers, models, server, and utilities
- Implemented `MockServerConfig` with Pydantic settings for centralized configuration management supporting environment variables
- Created HTTP request handlers for OpenAI-compatible endpoints:
  - `ChatCompletionsHandler` for `/v1/chat/completions` with streaming support
  - `CompletionsHandler` for the `/v1/completions` legacy endpoint
  - `TokenizerHandler` for vLLM-compatible `/tokenize` and `/detokenize` endpoints
- Added comprehensive Pydantic models for request/response validation compatible with both OpenAI and vLLM API specifications
- Implemented high-performance Sanic-based server with CORS support, middleware, and proper error handling
- Created mock tokenizer and text generation utilities with deterministic token generation for reproducible testing
- Added timing generators for realistic latency simulation including TTFT (Time To First Token) and ITL (Inter-Token Latency)
- Included comprehensive test suite with integration tests using real HTTP server instances

## **Test Plan**

- Unit/integration style tests added to automation

## **Related Issues**

- Part of the larger scheduler refactor initiative

---

- [x] "I certify that all code in this PR is my own, except as noted below."

## **Use of AI**

- [x] Includes AI-assisted code completion
- [x] Includes code generated by an AI application
- [ ] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`)

---------

Signed-off-by: Mark Kurtz <[email protected]>
1 parent 18e95be commit da02ee8
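Before the individual files, here is a minimal sketch of how a client (for example, a GuideLLM benchmark run) might exercise the mock server once it is running on the default `127.0.0.1:8000` binding from `MockServerConfig`. The request and response shapes follow the OpenAI chat-completions API implemented by the handlers below; `httpx` is used purely for illustration and is an assumption, not part of this PR.

```python
# Sketch: call the mock server's OpenAI-compatible endpoint (non-streaming).
# Assumes the server is already running on the default 127.0.0.1:8000 from
# MockServerConfig; httpx is just one possible HTTP client.
import httpx

payload = {
    "model": "llama-3.1-8b-instruct",
    "messages": [{"role": "user", "content": "Hello, mock server!"}],
}

resp = httpx.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json=payload,
    timeout=30.0,  # the handler sleeps to simulate TTFT/ITL before responding
)
resp.raise_for_status()
data = resp.json()

print(data["choices"][0]["message"]["content"])  # generated fake text
print(data["usage"])  # prompt/completion token counts reported by the mock
```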

File tree

11 files changed (+2315, -0 lines)

src/guidellm/mock_server/__init__.py

Lines changed: 8 additions & 0 deletions
"""
GuideLLM Mock Server for OpenAI and vLLM API compatibility.
"""

from .config import MockServerConfig
from .server import MockServer

__all__ = ["MockServer", "MockServerConfig"]

src/guidellm/mock_server/config.py

Lines changed: 84 additions & 0 deletions
"""
Configuration settings for the mock server component.

Provides centralized configuration management for mock server behavior including
network binding, model identification, response timing characteristics, and token
generation parameters. Supports environment variable configuration for deployment
flexibility with automatic validation through Pydantic settings.
"""

from __future__ import annotations

from pydantic import Field
from pydantic_settings import BaseSettings

__all__ = ["MockServerConfig"]


class MockServerConfig(BaseSettings):
    """
    Configuration settings for mock server behavior and deployment.

    Centralizes all configurable parameters for mock server operation including
    network settings, model identification, response timing characteristics, and
    token generation behavior. Environment variables with GUIDELLM_MOCK_SERVER_
    prefix override default values for deployment flexibility.

    Example:
    ::
        config = MockServerConfig(host="0.0.0.0", port=8080, model="custom-model")
        # Use with environment variables:
        # GUIDELLM_MOCK_SERVER_HOST=127.0.0.1 GUIDELLM_MOCK_SERVER_PORT=9000
    """

    host: str = Field(
        default="127.0.0.1", description="Host address to bind the server to"
    )
    port: int = Field(default=8000, description="Port number to bind the server to")
    workers: int = Field(default=1, description="Number of worker processes to spawn")
    model: str = Field(
        default="llama-3.1-8b-instruct",
        description="Model name to present in API responses",
    )
    processor: str | None = Field(
        default=None,
        description=(
            "Processor type to use for token stats, tokenize, and detokenize. "
            "If None, a mock one is created."
        ),
    )
    request_latency: float = Field(
        default=3.0,
        description="Base request latency in seconds for non-streaming responses",
    )
    request_latency_std: float = Field(
        default=0.0,
        description="Standard deviation for request latency variation",
    )
    ttft_ms: float = Field(
        default=150.0,
        description="Time to first token in milliseconds for streaming responses",
    )
    ttft_ms_std: float = Field(
        default=0.0,
        description="Standard deviation for time to first token variation",
    )
    itl_ms: float = Field(
        default=10.0,
        description="Inter-token latency in milliseconds for streaming responses",
    )
    itl_ms_std: float = Field(
        default=0.0,
        description="Standard deviation for inter-token latency variation",
    )
    output_tokens: int = Field(
        default=128, description="Number of output tokens to generate in responses"
    )
    output_tokens_std: float = Field(
        default=0.0,
        description="Standard deviation for output token count variation",
    )

    class Config:
        env_prefix = "GUIDELLM_MOCK_SERVER_"
        case_sensitive = False
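Because `MockServerConfig` subclasses `pydantic_settings.BaseSettings` with the `GUIDELLM_MOCK_SERVER_` prefix, any field can be overridden from the environment without code changes, and explicit keyword arguments still take precedence. A small sketch of that behavior (the values are arbitrary):

```python
# Sketch: environment variables with the GUIDELLM_MOCK_SERVER_ prefix override
# the field defaults when MockServerConfig() is constructed (pydantic-settings).
import os

from guidellm.mock_server.config import MockServerConfig

os.environ["GUIDELLM_MOCK_SERVER_PORT"] = "9000"
os.environ["GUIDELLM_MOCK_SERVER_TTFT_MS"] = "250"
os.environ["GUIDELLM_MOCK_SERVER_OUTPUT_TOKENS"] = "64"

config = MockServerConfig()
assert config.port == 9000
assert config.ttft_ms == 250.0
assert config.output_tokens == 64

# Explicit keyword arguments win over environment values:
config = MockServerConfig(port=8080)
assert config.port == 8080
```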

src/guidellm/mock_server/handlers/__init__.py

Lines changed: 17 additions & 0 deletions
"""
HTTP request handlers for the GuideLLM mock server.

This module exposes request handlers that implement OpenAI-compatible API endpoints
for the mock server. The handlers provide realistic LLM simulation capabilities
including chat completions, legacy completions, and tokenization services with
configurable timing characteristics, token counting, and proper error handling to
support comprehensive benchmarking and testing scenarios.
"""

from __future__ import annotations

from .chat_completions import ChatCompletionsHandler
from .completions import CompletionsHandler
from .tokenizer import TokenizerHandler

__all__ = ["ChatCompletionsHandler", "CompletionsHandler", "TokenizerHandler"]

src/guidellm/mock_server/handlers/chat_completions.py

Lines changed: 280 additions & 0 deletions
"""
OpenAI Chat Completions API endpoint handler for the mock server.

Provides a complete implementation of the /v1/chat/completions endpoint that simulates
realistic LLM behavior with configurable timing characteristics. Supports both streaming
and non-streaming responses with proper token counting, latency simulation including
TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
handling for comprehensive benchmarking scenarios.
"""

from __future__ import annotations

import asyncio
import json
import math
import time
import uuid

from pydantic import ValidationError
from sanic import response
from sanic.request import Request
from sanic.response import HTTPResponse, ResponseStream
from transformers import PreTrainedTokenizer

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.models import (
    ChatCompletionChoice,
    ChatCompletionsRequest,
    ChatCompletionsResponse,
    ChatMessage,
    ErrorDetail,
    ErrorResponse,
    Usage,
)
from guidellm.mock_server.utils import (
    MockTokenizer,
    create_fake_text,
    create_fake_tokens_str,
    sample_number,
    times_generator,
)

__all__ = ["ChatCompletionsHandler"]


class ChatCompletionsHandler:
    """
    Handles OpenAI Chat Completions API requests with realistic LLM simulation.

    Implements the /v1/chat/completions endpoint behavior including request validation,
    response generation, and timing simulation. Supports both streaming and
    non-streaming modes with configurable latency characteristics for comprehensive
    benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
    counting and realistic text generation.

    Example:
    ::
        config = MockServerConfig(ttft_ms=100, itl_ms=50)
        handler = ChatCompletionsHandler(config)
        response = await handler.handle(request)
    """

    def __init__(self, config: MockServerConfig) -> None:
        """
        Initialize the Chat Completions handler with server configuration.

        :param config: Mock server configuration containing timing and behavior settings
        """
        self.config = config
        self.tokenizer = (
            MockTokenizer()
            if config.processor is None
            else PreTrainedTokenizer.from_pretrained(config.processor)
        )

    async def handle(self, request: Request) -> HTTPResponse:
        """
        Process incoming chat completion requests with validation and routing.

        Validates the request payload, handles errors gracefully, and routes to
        appropriate streaming or non-streaming response handlers based on the
        request configuration.

        :param request: Sanic HTTP request containing chat completion parameters
        :return: HTTP response with completion data or error information
        :raises ValidationError: When request payload fails validation
        :raises JSONDecodeError: When request contains invalid JSON
        """
        try:
            # Parse and validate request
            req_data = ChatCompletionsRequest(**request.json)
        except ValidationError as exc:
            return response.json(
                ErrorResponse(
                    error=ErrorDetail(
                        message=f"Invalid request: {str(exc)}",
                        type="invalid_request_error",
                        code="invalid_request",
                    )
                ).model_dump(),
                status=400,
            )
        except (json.JSONDecodeError, TypeError):
            return response.json(
                ErrorResponse(
                    error=ErrorDetail(
                        message="Invalid JSON in request body",
                        type="invalid_request_error",
                        code="invalid_json",
                    )
                ).model_dump(),
                status=400,
            )

        # Handle streaming vs non-streaming
        if req_data.stream:
            return await self._handle_stream(req_data)
        else:
            return await self._handle_non_stream(req_data)

    async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
        """
        Generate complete non-streaming chat completion response.

        Simulates realistic LLM behavior with TTFT and ITL delays, generates
        appropriate token counts, and returns a complete response with usage
        statistics and generated content.

        :param req: Validated chat completion request parameters
        :return: Complete HTTP response with generated completion data
        """
        # TTFT delay
        await asyncio.sleep(
            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
        )

        # Token counts
        prompt_text = self.tokenizer.apply_chat_template(req.messages)
        prompt_tokens = len(self.tokenizer(prompt_text))
        max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
        completion_tokens_count = min(
            sample_number(self.config.output_tokens, self.config.output_tokens_std),
            max_tokens,
        )

        # ITL delay
        itl_delay = 0.0
        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
        for _ in range(int(completion_tokens_count) - 1):
            itl_delay += next(delays_iter)
        await asyncio.sleep(itl_delay / 1000.0)

        # Response
        chat_response = ChatCompletionsResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
            model=req.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChatMessage(
                        role="assistant",
                        content=create_fake_text(
                            int(completion_tokens_count), self.tokenizer
                        ),
                    ),
                    finish_reason="stop",
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=int(completion_tokens_count),
            ),
            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
        )

        return response.json(chat_response.model_dump())

    async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
        """
        Generate streaming chat completion response with real-time token delivery.

        Creates a streaming response that delivers tokens incrementally with
        realistic timing delays. Supports optional usage statistics in the final
        stream chunk when requested via stream_options.

        :param req: Validated chat completion request with streaming enabled
        :return: Streaming HTTP response delivering tokens with proper timing
        """

        async def generate_stream(stream_response):
            completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"

            # TTFT delay
            await asyncio.sleep(
                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
            )

            # Token counts
            prompt_text = self.tokenizer.apply_chat_template(req.messages)
            prompt_tokens = len(self.tokenizer(prompt_text))
            max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
            completion_tokens_count = int(
                min(
                    sample_number(
                        self.config.output_tokens, self.config.output_tokens_std
                    ),
                    max_tokens,
                )
            )

            # Send tokens
            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
            delays_iter = iter(
                times_generator(self.config.itl_ms, self.config.itl_ms_std)
            )

            for index, token in enumerate(tokens):
                if index > 0:
                    itl_delay = next(delays_iter)
                    await asyncio.sleep(itl_delay / 1000.0)

                chunk_data = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": req.model,
                    "choices": [
                        {
                            "index": 0,
                            "delta": {"content": token},
                            "finish_reason": None,
                        }
                    ],
                }
                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")

            # Send final chunk with finish reason
            final_chunk = {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": req.model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {},
                        "finish_reason": "stop",
                    }
                ],
            }
            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")

            # Send usage if requested
            if req.stream_options and req.stream_options.include_usage:
                usage_chunk = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": req.model,
                    "choices": [],
                    "usage": {
                        "prompt_tokens": prompt_tokens,
                        "completion_tokens": completion_tokens_count,
                        "total_tokens": prompt_tokens + completion_tokens_count,
                    },
                }
                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")

            # End stream
            await stream_response.write("data: [DONE]\n\n")

        return ResponseStream(  # type: ignore[return-value]
            generate_stream,
            content_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )
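To observe the simulated TTFT and ITL timing end to end, the streaming path above can be consumed as server-sent events. A minimal client sketch, assuming the mock server is running on the default `127.0.0.1:8000` and using `httpx` purely for illustration, that parses the `data:` chunks written by `generate_stream`:

```python
# Sketch: consume the SSE stream produced by ChatCompletionsHandler._handle_stream.
# Assumes the mock server runs on 127.0.0.1:8000; httpx is one possible client.
import json

import httpx

payload = {
    "model": "llama-3.1-8b-instruct",
    "messages": [{"role": "user", "content": "Stream something"}],
    "stream": True,
    "stream_options": {"include_usage": True},
}

with httpx.stream(
    "POST", "http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=60.0
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":  # end-of-stream sentinel written by the handler
            break
        chunk = json.loads(data)
        if chunk["choices"]:
            delta = chunk["choices"][0]["delta"]
            print(delta.get("content", ""), end="", flush=True)
        elif "usage" in chunk:
            print()
            print(chunk["usage"])  # final usage chunk when include_usage is set
```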
