@@ -101,17 +101,18 @@ def run_pipeline(
101
101
args .output_len_std ,
102
102
)
103
103
request_records = pipeline (request_records )
104
- assert len (request_records ) == args .num_requests * args .num_gpus
105
- sorted_requests : List [RequestRecord ] = [None ] * args .num_requests * args .num_gpus
104
+ num_total_requests = (
105
+ args .num_requests if not args .per_gpu_workload else args .num_requests * args .num_gpus
106
+ )
107
+ assert len (request_records ) == num_total_requests
108
+ sorted_requests : List [RequestRecord ] = [None ] * num_total_requests
106
109
for request_record in request_records :
107
110
assert request_record .request_id is not None
108
111
assert sorted_requests [request_record .request_id ] is None
109
112
sorted_requests [request_record .request_id ] = request_record
110
113
111
114
request_records = MetricAnalyzer (tokenizer )(request_records )
112
- report = generate_metrics_summary (
113
- request_records , args .num_requests * args .num_gpus , args .num_gpus
114
- )
115
+ report = generate_metrics_summary (request_records , num_total_requests , args .num_gpus )
115
116
return report , sorted_requests
116
117
117
118
@@ -221,6 +222,15 @@ def _main():
221
222
help = "The number of requests for warmup. "
222
223
"It is optional when fixing the number of concurrent requests, and is required otherwise." ,
223
224
)
225
+ parser .add_argument (
226
+ "--per-gpu-workload" ,
227
+ default = False ,
228
+ action = "store_true" ,
229
+ help = 'When set to True, the specified "num_concurrent_requests"/"request_rate" '
230
+ "denote the workload **per GPU**, which means that the real values of "
231
+ '"num_concurrent_requests"/"request_rate" used in benchmark'
232
+ 'will be multiplied by "num_gpus".' ,
233
+ )
224
234
parser .add_argument (
225
235
"--num-concurrent-requests" ,
226
236
type = _parse_num_concurrent_requests ,
@@ -354,13 +364,6 @@ def _main():
354
364
type = _parse_mlc_engine_config ,
355
365
help = "The engine config used when launch MLC server." ,
356
366
)
357
- parser .add_argument (
358
- "--output" ,
359
- "-o" ,
360
- type = str ,
361
- default = "mlc_benchmark.csv" ,
362
- help = "The path of the output file where to dump the benchmark results." ,
363
- )
364
367
parser .add_argument (
365
368
"--cuda-profile" ,
366
369
default = False ,
@@ -378,13 +381,16 @@ def _main():
378
381
"--multi-round" ,
379
382
default = False ,
380
383
action = "store_true" ,
381
- help = "Whether to chat like mulit round conversion with history log each request. "
384
+ help = "Whether to chat like multi round conversion with history log each request. "
382
385
"Only enabled when benchmarked with fixed concurrent request mode."
383
386
"The --num-concurrent-requests should be provided when enabling this option." ,
384
387
)
385
-
386
388
parser .add_argument (
387
- "--testset-name" , type = str , help = "The name of the testset. Only used for Loogle dataset"
389
+ "--output" ,
390
+ "-o" ,
391
+ type = str ,
392
+ default = "mlc_benchmark.csv" ,
393
+ help = "The path of the output file where to dump the benchmark results." ,
388
394
)
389
395
390
396
main (parser .parse_args ())
0 commit comments