Commit 65869d7 (1 parent: 12835e0)

initial groundwork for new benchmark package and classes along with required changes, migrations, and fixes

37 files changed: +2296 −2658 lines

src/guidellm/backend/backend.py
Lines changed: 8 additions & 0 deletions

@@ -102,6 +102,14 @@ def model(self) -> Optional[str]:
         """
         ...
 
+    @property
+    @abstractmethod
+    def info(self) -> Dict[str, Any]:
+        """
+        :return: The information about the backend.
+        """
+        ...
+
     def validate(self):
         """
         Handle final setup and validate the backend is ready for use.
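
Every registered backend now inherits this contract. Below is a minimal sketch of the obligation a concrete subclass picks up; the class is hypothetical and elides Backend's other abstract members:

from typing import Any, Dict, Optional


class MyBackend(Backend):  # hypothetical subclass for illustration
    @property
    def model(self) -> Optional[str]:
        return None  # illustrative; a real backend reports its model

    @property
    def info(self) -> Dict[str, Any]:
        # Surface static configuration for reporting; the base class
        # only requires that this returns a Dict[str, Any].
        return {"endpoint": "http://localhost:9000"}  # illustrative value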

src/guidellm/backend/openai.py
Lines changed: 43 additions & 5 deletions

@@ -19,6 +19,10 @@
 __all__ = ["OpenAIHTTPBackend"]
 
 
+TEXT_COMPLETIONS_PATH = "/v1/completions"
+CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
+
+
 @Backend.register("openai_http")
 class OpenAIHTTPBackend(Backend):
     """
@@ -61,6 +65,17 @@ def __init__(
     ):
         super().__init__(type_="openai_http")
         self._target = target or settings.openai.base_url
+
+        if not self._target:
+            raise ValueError("Target URL must be provided for OpenAI HTTP backend.")
+
+        if self._target.endswith("/v1") or self._target.endswith("/v1/"):
+            # backwards compatibility, strip v1 off
+            self._target = self._target[:-3]
+
+        if self._target.endswith("/"):
+            self._target = self._target[:-1]
+
         self._model = model
 
         api_key = api_key or settings.openai.api_key
@@ -94,6 +109,22 @@ def model(self) -> Optional[str]:
         """
         return self._model
 
+    @property
+    def info(self) -> Dict[str, Any]:
+        """
+        :return: The information about the backend.
+        """
+        return {
+            "max_output_tokens": self.max_output_tokens,
+            "timeout": self.timeout,
+            "http2": self.http2,
+            "authorization": bool(self.authorization),
+            "organization": self.organization,
+            "project": self.project,
+            "text_completions_path": TEXT_COMPLETIONS_PATH,
+            "chat_completions_path": CHAT_COMPLETIONS_PATH,
+        }
+
     def check_setup(self):
         """
         Check if the backend is setup correctly and can be used for requests.
@@ -379,12 +410,10 @@ async def _iterative_completions_request(
         headers: Dict,
         payload: Dict,
     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
-        target = f"{self.target}/v1/"
-
         if type_ == "text":
-            target += "completions"
+            target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
         elif type_ == "chat":
-            target += "chat/completions"
+            target = f"{self.target}{CHAT_COMPLETIONS_PATH}"
         else:
             raise ValueError(f"Unsupported type: {type_}")
 
@@ -407,6 +436,8 @@ async def _iterative_completions_request(
         iter_count = 0
         start_time = time.time()
         iter_time = start_time
+        first_iter_time: Optional[float] = None
+        last_iter_time: Optional[float] = None
 
         yield StreamingTextResponse(
             type_="start",
@@ -440,14 +471,19 @@ async def _iterative_completions_request(
 
                 data = json.loads(line.strip()[len("data: ") :])
                 if delta := self._extract_completions_delta_content(type_, data):
+                    if first_iter_time is None:
+                        first_iter_time = iter_time
+                    last_iter_time = iter_time
+
                     iter_count += 1
                     response_value += delta
 
                 yield StreamingTextResponse(
                     type_="iter",
                     value=response_value,
-                    start_time=start_time,
                     iter_count=iter_count,
+                    start_time=start_time,
+                    first_iter_time=first_iter_time,
                     delta=delta,
                     time=iter_time,
                     request_id=request_id,
@@ -477,6 +513,8 @@ async def _iterative_completions_request(
             ),
             start_time=start_time,
             end_time=iter_time,
+            first_iter_time=first_iter_time,
+            last_iter_time=last_iter_time,
             iterations=iter_count,
             request_prompt_tokens=request_prompt_tokens,
             request_output_tokens=request_output_tokens,
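
Taken together, the constructor changes normalize the common target spellings to the same base URL, and request paths are now assembled from the module-level constants. A quick sketch of the resulting behavior (illustrative values):

# Each of these normalizes to target == "http://localhost:8000"; the old
# trailing "/v1" form keeps working for backwards compatibility.
backend = OpenAIHTTPBackend(target="http://localhost:8000/v1/")
backend = OpenAIHTTPBackend(target="http://localhost:8000/v1")
backend = OpenAIHTTPBackend(target="http://localhost:8000")

# Request URLs are built as f"{self.target}{PATH}":
#   http://localhost:8000/v1/completions       for text completions
#   http://localhost:8000/v1/chat/completions  for chat completions

# The new info property also surfaces the paths for reporting:
backend.info["text_completions_path"]  # "/v1/completions"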

src/guidellm/backend/response.py
Lines changed: 13 additions & 3 deletions

@@ -67,11 +67,19 @@ class ResponseSummary(BaseModel):
 
     :param value: The final value returned from the request.
     :param request_args: The arguments used to make the request.
+    :param iterations: The number of iterations in the request.
     :param start_time: The time the request started.
     :param end_time: The time the request ended.
-    :param iterations: The number of iterations in the request.
-    :param prompt_tokens: The number of tokens in the prompt, if any usage was returned.
-    :param output_tokens: The number of tokens in the output, if any usage was returned.
+    :param first_iter_time: The time the first iteration was received.
+    :param last_iter_time: The time the last iteration was received.
+    :param request_prompt_tokens: The number of tokens measured in the prompt
+        for the request, if any.
+    :param request_output_tokens: The number of tokens enforced for the output
+        for the request, if any.
+    :param response_prompt_tokens: The number of tokens measured in the prompt
+        for the response, if any.
+    :param response_output_tokens: The number of tokens measured in the output
+        for the response, if any.
     :param request_id: The unique identifier for the request, if any.
     :param error: The error message, if any, returned from making the request.
     """
@@ -81,6 +89,8 @@ class ResponseSummary(BaseModel):
     iterations: int = 0
     start_time: Optional[float]
     end_time: Optional[float]
+    first_iter_time: Optional[float]
+    last_iter_time: Optional[float]
     request_prompt_tokens: Optional[int] = None
    request_output_tokens: Optional[int] = None
     response_prompt_tokens: Optional[int] = None
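
The two new timestamps make per-request streaming latencies derivable from a summary alone. A sketch of one plausible consumer follows; the helper is not part of the commit, and treating inter-token latency as the mean gap across iterations is this sketch's assumption:

from typing import Dict

from guidellm.backend import ResponseSummary


def derive_latency_metrics(summary: ResponseSummary) -> Dict[str, float]:
    """Illustrative helper: compute streaming latencies from a summary."""
    metrics: Dict[str, float] = {}
    if summary.start_time is not None and summary.first_iter_time is not None:
        # time to first token: first streamed chunk minus request start
        metrics["time_to_first_token"] = summary.first_iter_time - summary.start_time
    if (
        summary.first_iter_time is not None
        and summary.last_iter_time is not None
        and summary.iterations > 1
    ):
        # mean inter-token latency across the streamed iterations
        metrics["inter_token_latency"] = (
            summary.last_iter_time - summary.first_iter_time
        ) / (summary.iterations - 1)
    return metrics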

src/guidellm/benchmark/__init__.py
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+from .aggregator import AGG, BenchmarkAggregator, GenerativeBenchmarkAggregator
+from .benchmark import BENCH, Benchmark, GenerativeBenchmark
+from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
+from .profile import (
+    AsyncProfile,
+    ConcurrentProfile,
+    Profile,
+    ProfileType,
+    SweepProfile,
+    SynchronousProfile,
+    ThroughputProfile,
+    create_profile,
+)
+
+__all__ = [
+    "AGG",
+    "BENCH",
+    "Benchmark",
+    "BenchmarkAggregator",
+    "GenerativeBenchmark",
+    "GenerativeBenchmarkAggregator",
+    "Benchmarker",
+    "BenchmarkerResult",
+    "GenerativeBenchmarker",
+    "AsyncProfile",
+    "ConcurrentProfile",
+    "Profile",
+    "ProfileType",
+    "SweepProfile",
+    "SynchronousProfile",
+    "ThroughputProfile",
+    "create_profile",
+]
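
With the package root re-exporting the public names, downstream code can import the benchmark API from one place, e.g. (illustrative):

from guidellm.benchmark import GenerativeBenchmarkAggregator, create_profile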
src/guidellm/benchmark/aggregator.py
Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from typing import DefaultDict, Generic, List, TypeVar
+
+from pydantic import Field
+
+from guidellm.backend import ResponseSummary
+from guidellm.benchmark.benchmark import BENCH, Benchmark, GenerativeBenchmark
+from guidellm.objects import Serializable
+from guidellm.request import GenerationRequest
+from guidellm.scheduler import (
+    REQ,
+    RES,
+    SchedulerResult,
+)
+
+__all__ = [
+    "AGG",
+    "BenchmarkAggregator",
+    "GenerativeBenchmarkAggregator",
+]
+
+
+class BenchmarkAggregator(Generic[BENCH, REQ, RES], ABC, Serializable):
+    created_requests: int = 0
+    queued_requests: int = 0
+    scheduled_requests: int = 0
+    processing_requests: int = 0
+    completed_requests: int = 0
+    successful_requests: int = 0
+    errored_requests: int = 0
+
+    queued_time: float = 0.0
+    scheduled_time: float = 0.0
+    worker_time: float = 0.0
+    targeted_worker_start_delay: float = 0.0
+    process_idle_time: DefaultDict[int, float] = defaultdict(float)
+    process_idle_time_scratch: DefaultDict[int, float] = defaultdict(float)
+
+    def add_base_result(
+        self, result: SchedulerResult[REQ, RES], is_error: bool = False
+    ):
+        self.created_requests = result.run_info.created_requests
+        self.queued_requests = result.run_info.queued_requests
+        self.scheduled_requests = result.run_info.scheduled_requests
+        self.processing_requests = result.run_info.processing_requests
+        self.completed_requests = result.run_info.completed_requests
+
+        if result.type_ != "request_complete":
+            return
+
+        if is_error:
+            self.errored_requests += 1
+        else:
+            self.successful_requests += 1
+
+        self.queued_time += (
+            result.request_info.scheduled_time - result.request_info.queued_time
+        )
+        self.scheduled_time += (
+            result.request_info.worker_start - result.request_info.scheduled_time
+        )
+
+        self.worker_time += (
+            result.request_info.worker_end - result.request_info.worker_start
+        )
+        self.targeted_worker_start_delay += (
+            result.request_info.worker_start - result.request_info.targeted_start_time
+        )
+
+        first_process_request = (
+            result.request_info.process_id not in self.process_idle_time_scratch
+        )
+        if not first_process_request:
+            self.process_idle_time_scratch[result.request_info.process_id] -= (
+                result.request_info.worker_start
+            )
+            self.process_idle_time[result.request_info.process_id] = (
+                self.process_idle_time_scratch[result.request_info.process_id]
+            )
+        self.process_idle_time_scratch[result.request_info.process_id] += (
+            result.request_info.worker_end
+        )
+
+    def add_result(self, result: SchedulerResult[REQ, RES]):
+        self.add_base_result(result)
+
+    @abstractmethod
+    def compile(self) -> Benchmark[BENCH]: ...
+
+
+AGG = TypeVar("AGG", bound=BenchmarkAggregator)
+
+
+class GenerativeBenchmarkAggregator(
+    BenchmarkAggregator[GenerativeBenchmark, GenerationRequest, ResponseSummary]
+):
+    results: List[SchedulerResult[GenerationRequest, ResponseSummary]] = Field(
+        default_factory=list,
+        description="The list of results for the benchmark.",
+    )
+
+    request_time_total: float = 0.0
+    targeted_request_delay_total: float = 0.0
+    time_to_first_token_total: float = 0.0
+    inter_token_latency_total: float = 0.0
+    prompt_tokens_total: int = 0
+    output_tokens_total: int = 0
+
+    def add_result(self, result: SchedulerResult[GenerationRequest, ResponseSummary]):
+        is_error = bool(result.response.error)
+        self.add_base_result(result, is_error=is_error)
+
+        if result.type_ != "request_complete":
+            return
+
+        self.results.append(result)
+
+        if not is_error:
+            self.request_time_total += (result.response.end_time or 0.0) - (
+                result.response.start_time or 0.0
+            )
+            self.targeted_request_delay_total += (result.response.start_time or 0.0) - (
+                result.request_info.targeted_start_time or 0.0
+            )
+            self.time_to_first_token_total += (
+                result.response.first_iter_time or 0.0
+            ) - (result.response.start_time or 0.0)
+            self.inter_token_latency_total += (
+                result.response.last_iter_time or 0.0
+            ) - (result.response.first_iter_time or 0.0)
+            self.prompt_tokens_total += result.response.prompt_tokens or 0
+            self.output_tokens_total += result.response.output_tokens or 0
+
+    def compile(self) -> GenerativeBenchmark:
+        pass  # TODO
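
compile() is left as a TODO in this commit, but the totals the aggregator accumulates, divided by successful_requests, already hint at the averages a compiled GenerativeBenchmark could report. A hedged sketch of that arithmetic (the helper and its output keys are illustrative, not the commit's API):

from typing import Dict

from guidellm.benchmark import GenerativeBenchmarkAggregator


def mean_generative_metrics(agg: GenerativeBenchmarkAggregator) -> Dict[str, float]:
    """Illustrative only: average the accumulated totals per successful request."""
    count = agg.successful_requests
    if count == 0:
        return {}
    return {
        "avg_request_time": agg.request_time_total / count,
        "avg_time_to_first_token": agg.time_to_first_token_total / count,
        "avg_inter_token_latency": agg.inter_token_latency_total / count,
        "avg_prompt_tokens": agg.prompt_tokens_total / count,
        "avg_output_tokens": agg.output_tokens_total / count,
    }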
