|
30 | 30 | import random
|
31 | 31 | import time
|
32 | 32 | import warnings
|
33 |
| -from collections.abc import AsyncGenerator, Iterable |
| 33 | +from collections.abc import Iterable |
34 | 34 | from dataclasses import dataclass
|
35 | 35 | from datetime import datetime
|
36 | 36 | from typing import Any, Literal, Optional
|
|
73 | 73 | VisionArenaDataset,
|
74 | 74 | )
|
75 | 75 | from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
| 76 | +from vllm.benchmarks.serve import get_request |
76 | 77 |
|
77 | 78 | MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
78 | 79 |
|
@@ -107,101 +108,6 @@ class BenchmarkMetrics:
|
107 | 108 | percentiles_e2el_ms: list[tuple[float, float]]
|
108 | 109 |
|
109 | 110 |
|
110 |
| -def _get_current_request_rate( |
111 |
| - ramp_up_strategy: Optional[Literal["linear", "exponential"]], |
112 |
| - ramp_up_start_rps: Optional[int], |
113 |
| - ramp_up_end_rps: Optional[int], |
114 |
| - request_index: int, |
115 |
| - total_requests: int, |
116 |
| - request_rate: float, |
117 |
| -) -> float: |
118 |
| - if ( |
119 |
| - ramp_up_strategy |
120 |
| - and ramp_up_start_rps is not None |
121 |
| - and ramp_up_end_rps is not None |
122 |
| - ): |
123 |
| - progress = request_index / max(total_requests - 1, 1) |
124 |
| - if ramp_up_strategy == "linear": |
125 |
| - increase = (ramp_up_end_rps - ramp_up_start_rps) * progress |
126 |
| - return ramp_up_start_rps + increase |
127 |
| - elif ramp_up_strategy == "exponential": |
128 |
| - ratio = ramp_up_end_rps / ramp_up_start_rps |
129 |
| - return ramp_up_start_rps * (ratio**progress) |
130 |
| - else: |
131 |
| - raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") |
132 |
| - return request_rate |
133 |
| - |
134 |
| - |
135 |
async def get_request(
    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
    ramp_up_start_rps: Optional[int] = None,
    ramp_up_end_rps: Optional[int] = None,
) -> AsyncGenerator[tuple[SampleRequest, float], None]:
    """
    Asynchronously yield ``(request, current_rate)`` pairs, pacing them
    at a (possibly ramping) request rate with optional burstiness.

    Args:
        input_requests:
            The requests to emit, each a SampleRequest.
        request_rate:
            Constant generation rate in requests/s (used when no
            ramp-up strategy is given).
        burstiness (optional):
            Shape of the inter-arrival distribution; only relevant for
            a finite rate. 1.0 gives a Poisson process (exponential
            gaps); values below 1 make arrivals burstier, values above
            1 make them more uniform.
        ramp_up_strategy (optional):
            "linear" or "exponential" to ramp the rate across the run;
            None keeps the constant ``request_rate``.
        ramp_up_start_rps (optional):
            Rate at the start of the ramp.
        ramp_up_end_rps (optional):
            Rate at the end of the ramp.
    """
    assert burstiness > 0, (
        f"A positive burstiness factor is expected, but given {burstiness}."
    )
    # Materialize lazy iterables up front so len() is available for the
    # ramp-up progress calculation.
    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
        input_requests = list(input_requests)

    total_requests = len(input_requests)

    for request_index, request in enumerate(input_requests):
        current_request_rate = _get_current_request_rate(
            ramp_up_strategy,
            ramp_up_start_rps,
            ramp_up_end_rps,
            request_index,
            total_requests,
            request_rate,
        )

        yield request, current_request_rate

        if current_request_rate == float("inf"):
            # Infinite rate: emit back-to-back with no pacing delay.
            continue

        # Gamma inter-arrival times: shape=burstiness, scale chosen so
        # the mean gap is 1/rate (shape * scale == 1 / rate). With
        # shape 1 this reduces to the exponential distribution.
        mean_scale = 1.0 / (current_request_rate * burstiness)
        delay = np.random.gamma(shape=burstiness, scale=mean_scale)
        await asyncio.sleep(delay)
203 |
| - |
204 |
| - |
205 | 111 | def calculate_metrics(
|
206 | 112 | input_requests: list[SampleRequest],
|
207 | 113 | outputs: list[RequestFuncOutput],
|
|
0 commit comments