
Commit 88074ea
[Bench] Add "per-gpu-workload" mode (#3068)
This PR introduces the per-gpu-workload mode to MLC bench. In this mode, the specified "num_concurrent_requests" and "request_rate" denote the workload **per GPU**, so the overall workload that the benchmark applies to the serving system is multiplied by the number of GPUs. This PR also deprecates the `--testset-name` argument in favor of `--dataset-path` for the Loogle dataset.
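To make the scaling rule concrete, here is a minimal sketch of the arithmetic this mode implies (the helper name `effective_workload` is hypothetical, not part of the codebase):

```python
def effective_workload(value: float, num_gpus: int, per_gpu_workload: bool) -> float:
    """Hypothetical helper: the scaling rule introduced by this commit.

    In per-GPU mode the user-facing number is interpreted per GPU, so the
    whole-system workload is value * num_gpus; otherwise it is used as-is.
    """
    return value * num_gpus if per_gpu_workload else value

# A request rate of 2.0 on 4 GPUs:
assert effective_workload(2.0, num_gpus=4, per_gpu_workload=True) == 8.0   # system-wide rate
assert effective_workload(2.0, num_gpus=4, per_gpu_workload=False) == 2.0  # unchanged
```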
1 parent 88ebe6f commit 88074ea

File tree

3 files changed: +32 -22 lines

python/mlc_llm/bench/__main__.py

Lines changed: 21 additions & 15 deletions
```diff
@@ -101,17 +101,18 @@ def run_pipeline(
         args.output_len_std,
     )
     request_records = pipeline(request_records)
-    assert len(request_records) == args.num_requests * args.num_gpus
-    sorted_requests: List[RequestRecord] = [None] * args.num_requests * args.num_gpus
+    num_total_requests = (
+        args.num_requests if not args.per_gpu_workload else args.num_requests * args.num_gpus
+    )
+    assert len(request_records) == num_total_requests
+    sorted_requests: List[RequestRecord] = [None] * num_total_requests
     for request_record in request_records:
         assert request_record.request_id is not None
         assert sorted_requests[request_record.request_id] is None
         sorted_requests[request_record.request_id] = request_record

     request_records = MetricAnalyzer(tokenizer)(request_records)
-    report = generate_metrics_summary(
-        request_records, args.num_requests * args.num_gpus, args.num_gpus
-    )
+    report = generate_metrics_summary(request_records, num_total_requests, args.num_gpus)
     return report, sorted_requests
```
```diff
@@ -221,6 +222,15 @@ def _main():
         help="The number of requests for warmup. "
         "It is optional when fixing the number of concurrent requests, and is required otherwise.",
     )
+    parser.add_argument(
+        "--per-gpu-workload",
+        default=False,
+        action="store_true",
+        help='When set to True, the specified "num_concurrent_requests"/"request_rate" '
+        "denote the workload **per GPU**, which means that the real values of "
+        '"num_concurrent_requests"/"request_rate" used in benchmark '
+        'will be multiplied by "num_gpus".',
+    )
     parser.add_argument(
         "--num-concurrent-requests",
         type=_parse_num_concurrent_requests,
```
```diff
@@ -354,13 +364,6 @@ def _main():
         type=_parse_mlc_engine_config,
         help="The engine config used when launch MLC server.",
     )
-    parser.add_argument(
-        "--output",
-        "-o",
-        type=str,
-        default="mlc_benchmark.csv",
-        help="The path of the output file where to dump the benchmark results.",
-    )
     parser.add_argument(
         "--cuda-profile",
         default=False,
```
```diff
@@ -378,13 +381,16 @@ def _main():
         "--multi-round",
         default=False,
         action="store_true",
-        help="Whether to chat like mulit round conversion with history log each request. "
+        help="Whether to chat like multi round conversion with history log each request. "
         "Only enabled when benchmarked with fixed concurrent request mode."
         "The --num-concurrent-requests should be provided when enabling this option.",
     )
-
     parser.add_argument(
-        "--testset-name", type=str, help="The name of the testset. Only used for Loogle dataset"
+        "--output",
+        "-o",
+        type=str,
+        default="mlc_benchmark.csv",
+        help="The path of the output file where to dump the benchmark results.",
     )

     main(parser.parse_args())
```

python/mlc_llm/bench/dataset.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -174,12 +174,11 @@ class LoogleDataset(Dataset):  # pylint: disable=too-few-public-methods
     # pylint: enable=line-too-long
     require_fake_warmup: bool = True

-    def __init__(self, tokenizer: AutoTokenizer, testset_name) -> None:
+    def __init__(self, tokenizer: AutoTokenizer, testset_name: str) -> None:
         raw_dataset = load_dataset("bigainlco/LooGLE", testset_name, split="test")
         self.tokenizer = tokenizer
         self.dataset = []
         self.prompt_format = self.task2prompt[testset_name]
-        # self.max_gen = self.task2maxlen[testset_name]
         prompts = []
         generate_lens = []
         questions = []
@@ -806,7 +805,7 @@ def create_dataset(args: argparse.Namespace, tokenizer: AutoTokenizer) -> "Dataset"
         assert (
             args.apply_chat_template is False
         ), "Loogle dataset does not support applying chat template"
-        return LoogleDataset(tokenizer, args.testset_name)
+        return LoogleDataset(tokenizer, testset_name=args.dataset_path)
     if args.dataset == "react":
         assert (
             args.apply_chat_template is False
```

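With this change, the LooGLE subset name is supplied through `--dataset-path`. A hedged usage sketch follows; `shortdep_qa` is one subset name from the LooGLE dataset card, and the tokenizer checkpoint is likewise only an example:

```python
from transformers import AutoTokenizer

# LoogleDataset is the class defined in python/mlc_llm/bench/dataset.py above.
# After this commit the LooGLE subset name arrives via args.dataset_path
# instead of the removed --testset-name flag.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint only
dataset = LoogleDataset(tokenizer, testset_name="shortdep_qa")  # value of --dataset-path
```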
python/mlc_llm/bench/request_processor.py

Lines changed: 9 additions & 4 deletions
```diff
@@ -622,22 +622,27 @@ def create_pipelines(
             "Please specify the number of warmup requests via "
             '"--num-warmup-requests" when fixing request rate.'
         )
+        num_total_requests = int(
+            args.num_requests if not args.per_gpu_workload else args.num_requests * args.num_gpus
+        )
         if dataset.require_fake_warmup:
-            num_samples = int(args.num_requests * args.num_gpus)
+            num_samples = num_total_requests
         else:
-            num_samples = int(args.num_requests * args.num_gpus) + args.num_warmup_requests
+            num_samples = num_total_requests + args.num_warmup_requests
         return [
             SequentialProcessor(
                 LogMessage(f"Fixing request rate: {request_rate}"),
                 SampleRequests(num_samples),
                 AttachModelName(args.tokenizer),
-                AttachRequestRateTimestamp(request_rate * args.num_gpus),
+                AttachRequestRateTimestamp(
+                    request_rate if not args.per_gpu_workload else request_rate * args.num_gpus
+                ),
                 AttachStreamFlag(args.stream),
                 AttachSamplingOptions(args.temperature, args.top_p, args.ignore_eos),
                 AttachExecutionFeature({"request_rate": float(request_rate)}),
                 WarmupAndRun(
                     num_warmup_requests=args.num_warmup_requests,
-                    num_benchmark_requests=int(args.num_requests * args.num_gpus),
+                    num_benchmark_requests=num_total_requests,
                     pipeline=FixTimestampExecutor(
                         f_create_api_endpoint,
                         args.num_process_workers,
```

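To illustrate the request-rate branch above, here is a minimal sketch of how arrival timestamps scale in per-GPU mode (a hypothetical helper with uniform arrivals for simplicity; the actual `AttachRequestRateTimestamp` processor may use a different arrival process):

```python
from typing import List

def fixed_rate_timestamps(
    num_requests: int, request_rate: float, num_gpus: int, per_gpu_workload: bool
) -> List[float]:
    """Hypothetical sketch: send times under a fixed request rate.

    Mirrors the branch in the diff: in per-GPU mode the effective
    system-wide rate is request_rate * num_gpus; otherwise the rate
    is used as given.
    """
    effective_rate = request_rate * num_gpus if per_gpu_workload else request_rate
    interval = 1.0 / effective_rate
    return [i * interval for i in range(num_requests)]

# 8 requests at 2 req/s per GPU on 2 GPUs -> 4 req/s system-wide, 0.25 s apart
print(fixed_rate_timestamps(8, request_rate=2.0, num_gpus=2, per_gpu_workload=True))
```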