Commit 10904e6

[benchmark] Port benchmark request sent optimization to benchmark_serving (#21209)
Signed-off-by: Jialin Ouyang <[email protected]>
1 parent a322376 · commit 10904e6

File tree

2 files changed: +7 -101 lines changed


benchmarks/benchmark_serving.py

Lines changed: 2 additions & 96 deletions
@@ -30,7 +30,7 @@
 import random
 import time
 import warnings
-from collections.abc import AsyncGenerator, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Literal, Optional
@@ -73,6 +73,7 @@
     VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm.benchmarks.serve import get_request
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
@@ -107,101 +108,6 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]
 
 
-def _get_current_request_rate(
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-    ramp_up_start_rps: Optional[int],
-    ramp_up_end_rps: Optional[int],
-    request_index: int,
-    total_requests: int,
-    request_rate: float,
-) -> float:
-    if (
-        ramp_up_strategy
-        and ramp_up_start_rps is not None
-        and ramp_up_end_rps is not None
-    ):
-        progress = request_index / max(total_requests - 1, 1)
-        if ramp_up_strategy == "linear":
-            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
-            return ramp_up_start_rps + increase
-        elif ramp_up_strategy == "exponential":
-            ratio = ramp_up_end_rps / ramp_up_start_rps
-            return ramp_up_start_rps * (ratio**progress)
-        else:
-            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
-    return request_rate
-
-
-async def get_request(
-    input_requests: list[SampleRequest],
-    request_rate: float,
-    burstiness: float = 1.0,
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
-) -> AsyncGenerator[tuple[SampleRequest, float], None]:
-    """
-    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
-
-    Args:
-        input_requests:
-            A list of input requests, each represented as a SampleRequest.
-        request_rate:
-            The rate at which requests are generated (requests/s).
-        burstiness (optional):
-            The burstiness factor of the request generation.
-            Only takes effect when request_rate is not inf.
-            Default value is 1, which follows a Poisson process.
-            Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results
-            in more bursty requests, while a higher burstiness value
-            (burstiness > 1) results in a more uniform arrival of requests.
-        ramp_up_strategy (optional):
-            The ramp-up strategy. Can be "linear" or "exponential".
-            If None, uses constant request rate (specified by request_rate).
-        ramp_up_start_rps (optional):
-            The starting request rate for ramp-up.
-        ramp_up_end_rps (optional):
-            The ending request rate for ramp-up.
-    """
-    assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}."
-    )
-    # Convert to list to get length for ramp-up calculations
-    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
-        input_requests = list(input_requests)
-
-    total_requests = len(input_requests)
-    request_index = 0
-
-    for request in input_requests:
-        current_request_rate = _get_current_request_rate(
-            ramp_up_strategy,
-            ramp_up_start_rps,
-            ramp_up_end_rps,
-            request_index,
-            total_requests,
-            request_rate,
-        )
-
-        yield request, current_request_rate
-
-        request_index += 1
-
-        if current_request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
-
-        theta = 1.0 / (current_request_rate * burstiness)
-
-        # Sample the request interval from the gamma distribution.
-        # If burstiness is 1, it follows exponential distribution.
-        interval = np.random.gamma(shape=burstiness, scale=theta)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
-
-
 def calculate_metrics(
     input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
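For reference, the ramp-up math leaving this file: the deleted _get_current_request_rate interpolates the per-request rate from ramp_up_start_rps to ramp_up_end_rps as the run progresses. A minimal standalone sketch of that interpolation (the ramped_rate name is illustrative, not vLLM API):

# Linear vs. exponential ramp-up, as in the deleted helper: at progress
# p in [0, 1], linear gives start + (end - start) * p, exponential
# gives start * (end / start) ** p.
def ramped_rate(strategy: str, start_rps: float, end_rps: float,
                request_index: int, total_requests: int) -> float:
    progress = request_index / max(total_requests - 1, 1)
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * progress
    if strategy == "exponential":
        return start_rps * (end_rps / start_rps) ** progress
    raise ValueError(f"Unknown ramp-up strategy: {strategy}")


# Ramping from 1 to 16 rps over 5 requests: linear gives
# 1, 4.75, 8.5, 12.25, 16; exponential gives 1, 2, 4, 8, 16.
for strategy in ("linear", "exponential"):
    rates = [ramped_rate(strategy, 1, 16, i, 5) for i in range(5)]
    print(strategy, [round(r, 2) for r in rates])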

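The deleted get_request paced sends by drawing each inter-arrival gap from a gamma distribution with mean 1 / request_rate; with burstiness == 1 the gamma reduces to an exponential distribution, i.e. Poisson arrivals. A minimal sketch of that sampling, assuming only NumPy (sample_intervals is an illustrative name, not vLLM API):

import numpy as np


def sample_intervals(request_rate: float, burstiness: float,
                     n: int) -> np.ndarray:
    """Draw n inter-arrival gaps (seconds) for the given request rate."""
    assert burstiness > 0, "burstiness must be positive"
    # Gamma(shape=k, scale=theta) has mean k * theta, so theta =
    # 1 / (rate * burstiness) keeps the mean gap at 1 / rate for any
    # burstiness; only the spread changes.
    theta = 1.0 / (request_rate * burstiness)
    return np.random.gamma(shape=burstiness, scale=theta, size=n)


# burstiness < 1 -> clumpier arrivals (higher variance); burstiness > 1
# -> more uniform gaps. The mean gap stays ~0.1 s at 10 requests/s.
for b in (0.5, 1.0, 4.0):
    gaps = sample_intervals(request_rate=10.0, burstiness=b, n=100_000)
    print(f"burstiness={b}: mean={gaps.mean():.4f}s std={gaps.std():.4f}s")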
vllm/benchmarks/serve.py

Lines changed: 5 additions & 5 deletions
@@ -179,12 +179,12 @@ async def get_request(
         delay_ts = [delay * normalize_factor for delay in delay_ts]
 
     start_ts = time.time()
-    request_index = 0
     for request_index, request in enumerate(input_requests):
-        current_ts = time.time()
-        sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
-        if sleep_interval_s > 0:
-            await asyncio.sleep(sleep_interval_s)
+        if delay_ts[request_index] > 0:
+            current_ts = time.time()
+            sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
+            if sleep_interval_s > 0:
+                await asyncio.sleep(sleep_interval_s)
         yield request, request_rates[request_index]
 
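This is the optimization being ported: get_request in serve.py precomputes every send delay relative to a single start_ts, and the new guard on delay_ts[request_index] > 0 skips the per-iteration time.time() read and sleep bookkeeping whenever a request should go out immediately, e.g. with request_rate=inf, where every delay is 0. A self-contained sketch of the pattern with simplified types (the real generator takes SampleRequest objects and also yields per-request rates):

import asyncio
import time
from collections.abc import AsyncGenerator


async def paced(requests: list[str],
                delay_ts: list[float]) -> AsyncGenerator[str, None]:
    # delay_ts[i] is the offset (seconds) from start_ts at which
    # request i should be sent; it is computed once, up front.
    start_ts = time.time()
    for i, request in enumerate(requests):
        if delay_ts[i] > 0:
            # Sleeping toward an absolute target (start_ts + delay)
            # keeps per-request overhead from accumulating as drift.
            sleep_interval_s = start_ts + delay_ts[i] - time.time()
            if sleep_interval_s > 0:
                await asyncio.sleep(sleep_interval_s)
        yield request


async def main() -> None:
    # All-zero delays model request_rate=inf: requests go out back to
    # back, with no clock reads or sleeps on the hot path.
    async for r in paced(["a", "b", "c"], [0.0, 0.0, 0.0]):
        print(r, f"{time.time():.3f}")


asyncio.run(main())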
