Skip to content

Commit e5f3880

Browse files
authored
Add prefix-cache dataset in mlc bench (#3065)
Add Loogle and React datasets. Add a fake-warmup option to avoid initializing the prefix cache.
1 parent 385cef2 commit e5f3880

File tree

5 files changed

+417
-23
lines changed

5 files changed

+417
-23
lines changed

python/mlc_llm/bench/__main__.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,17 @@ def run_pipeline(
101101
args.output_len_std,
102102
)
103103
request_records = pipeline(request_records)
104-
assert len(request_records) == args.num_requests
105-
sorted_requests: List[RequestRecord] = [None] * args.num_requests
104+
assert len(request_records) == args.num_requests * args.num_gpus
105+
sorted_requests: List[RequestRecord] = [None] * args.num_requests * args.num_gpus
106106
for request_record in request_records:
107107
assert request_record.request_id is not None
108108
assert sorted_requests[request_record.request_id] is None
109109
sorted_requests[request_record.request_id] = request_record
110110

111111
request_records = MetricAnalyzer(tokenizer)(request_records)
112-
report = generate_metrics_summary(request_records, args.num_requests, args.num_gpus)
112+
report = generate_metrics_summary(
113+
request_records, args.num_requests * args.num_gpus, args.num_gpus
114+
)
113115
return report, sorted_requests
114116

115117

@@ -135,7 +137,7 @@ def _main():
135137
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
136138
dataset = create_dataset(args, tokenizer)
137139
f_create_api_endpoint = functools.partial(create_api_endpoint, args)
138-
pipelines = create_pipelines(args, f_create_api_endpoint)
140+
pipelines = create_pipelines(args, f_create_api_endpoint, dataset)
139141
reports = []
140142
alltime_records = {}
141143
for i, pipeline in enumerate(pipelines):
@@ -291,6 +293,7 @@ def _main():
291293
parser.add_argument(
292294
"--timeout",
293295
type=float,
296+
default=3 * 60 * 60,
294297
help="The timeout limit of each request.",
295298
)
296299
parser.add_argument(
@@ -380,4 +383,8 @@ def _main():
380383
"The --num-concurrent-requests should be provided when enabling this option.",
381384
)
382385

386+
parser.add_argument(
387+
"--testset-name", type=str, help="The name of the testset. Only used for Loogle dataset"
388+
)
389+
383390
main(parser.parse_args())

python/mlc_llm/bench/api_endpoint.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def __init__( # pylint: disable=too-many-arguments
5757
async def __aenter__(self) -> Self:
5858
import aiohttp # pylint: disable=import-outside-toplevel,import-error
5959

60-
self.client = aiohttp.ClientSession()
60+
self.client = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(self.timeout))
6161
return self
6262

6363
async def __aexit__(self, exc_type, exc_value, tb) -> None:
@@ -249,7 +249,9 @@ async def __call__( # pylint: disable=too-many-branches,too-many-statements
249249
start_time = time.monotonic()
250250

251251
try:
252-
async with self.client.post(self.url, json=payload, headers=self.headers) as response:
252+
async with self.client.post(
253+
self.url, json=payload, headers=self.headers, timeout=3600
254+
) as response:
253255
assert response.status == 200, await response.text()
254256
if payload["stream"]:
255257
async for chunk in response.content:

0 commit comments

Comments
 (0)