Commit e7f720e

[Misc] add coding benchmark for speculative decoding (#15303)
Signed-off-by: CXIAAAAA <[email protected]>
1 parent: 4ae17bf

File tree

benchmarks/benchmark_dataset.py
benchmarks/benchmark_serving.py
benchmarks/benchmark_throughput.py

3 files changed: +101 -21 lines changed

benchmarks/benchmark_dataset.py

Lines changed: 63 additions & 0 deletions
@@ -715,3 +715,66 @@ def sample(
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests)
         return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# Instruct Coder Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class InstructCoderDataset(HuggingFaceDataset):
+    """
+    InstructCoder Dataset.
+    https://huggingface.co/datasets/likaixin/InstructCoder
+
+    InstructCoder is a dataset designed for general code editing.
+    It consists of 114,239 instruction-input-output triplets,
+    and covers multiple distinct code editing scenarios.
+    """
+
+    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
+    DEFAULT_NUM_REQUESTS = 1000
+    INSTRUCT_CODER_DATASET_PATH = "likaixin/InstructCoder"
+
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        if self.dataset_path != self.INSTRUCT_CODER_DATASET_PATH:
+            raise ValueError(f"Only support likaixin/InstructCoder dataset. "
+                             f"This data path {self.dataset_path} is not valid.")
+        if self.dataset_subset is None and self.dataset_split != "train":
+            raise ValueError("Dataset split must be 'train'.")
+
+    def load_data(self) -> None:
+        dataset = load_dataset(
+            self.dataset_path,
+            name=self.dataset_subset,
+            split=self.dataset_split,
+            streaming=True,
+        )
+        self.data = dataset.shuffle(seed=self.random_seed)
+
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
+               **kwargs) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
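
For orientation, a minimal usage sketch of the new class (not part of the commit). It assumes the HuggingFaceDataset base class accepts the dataset_path / dataset_subset / dataset_split / random_seed keyword arguments that the attributes above imply; the tokenizer choice is an arbitrary placeholder.

    # Hedged usage sketch; see assumptions in the note above.
    from transformers import AutoTokenizer

    from benchmark_dataset import InstructCoderDataset

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

    dataset = InstructCoderDataset(
        dataset_path="likaixin/InstructCoder",  # the only path __init__ accepts
        dataset_subset=None,
        dataset_split="train",  # enforced by __init__ when no subset is given
        random_seed=0,
    )
    dataset.load_data()  # cheap (streaming); harmless if __init__ already loaded

    # sample() returns SampleRequest objects; expected_output_len falls back to
    # DEFAULT_OUTPUT_LEN (200) when output_len is not given.
    requests = dataset.sample(tokenizer=tokenizer, num_requests=4)
    for req in requests:
        print(req.prompt_len, req.expected_output_len)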

benchmarks/benchmark_serving.py

Lines changed: 11 additions & 5 deletions
@@ -53,8 +53,9 @@
 from argparse import ArgumentParser as FlexibleArgumentParser
 
 from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
-                               RandomDataset, SampleRequest, ShareGPTDataset,
-                               SonnetDataset, VisionArenaDataset)
+                               InstructCoderDataset, RandomDataset,
+                               SampleRequest, ShareGPTDataset, SonnetDataset,
+                               VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@@ -588,9 +589,14 @@ def main(args: argparse.Namespace):
     elif args.dataset_name == "hf":
         # Choose between VisionArenaDataset
         # and HuggingFaceDataset based on provided parameters.
-        dataset_class = (VisionArenaDataset if args.dataset_path
-                         == VisionArenaDataset.VISION_ARENA_DATASET_PATH
-                         and args.hf_subset is None else HuggingFaceDataset)
+        dataset_class = HuggingFaceDataset
+        if args.dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH:
+            assert args.hf_subset is None, "VisionArenaDataset needs hf_subset to be None."  # noqa: E501
+            dataset_class = VisionArenaDataset
+        elif args.dataset_path == "likaixin/InstructCoder":
+            dataset_class = InstructCoderDataset
+            args.hf_split = "train"
+
         input_requests = dataset_class(
             dataset_path=args.dataset_path,
             dataset_subset=args.hf_subset,
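
With this routing in place, pointing the serving benchmark at likaixin/InstructCoder selects InstructCoderDataset automatically and forces the train split, so no --hf-subset/--hf-split flags are needed. A hedged example invocation (the model name is a placeholder; the remaining flags are the script's existing ones):

    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name hf \
        --dataset-path likaixin/InstructCoder \
        --num-prompts 1000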

benchmarks/benchmark_throughput.py

Lines changed: 27 additions & 16 deletions
@@ -12,8 +12,9 @@
 import torch
 import uvloop
 from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
-                               RandomDataset, SampleRequest, ShareGPTDataset,
-                               SonnetDataset, VisionArenaDataset)
+                               InstructCoderDataset, RandomDataset,
+                               SampleRequest, ShareGPTDataset, SonnetDataset,
+                               VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,

@@ -300,6 +301,7 @@ def get_requests(args, tokenizer):
         "input_len": args.input_len,
         "output_len": args.output_len,
     }
+
     if args.dataset_path is None or args.dataset_name == "random":
         sample_kwargs["range_ratio"] = args.random_range_ratio
         sample_kwargs["prefix_len"] = args.prefix_len

@@ -317,17 +319,21 @@
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
-        if args.backend != "vllm-chat":
-            raise ValueError(
-                "hf datasets only are supported by vllm-chat backend")
-        # Choose between VisionArenaDataset and HuggingFaceDataset based on
-        # provided parameters.
-        dataset_cls = (VisionArenaDataset if args.dataset_path
-                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
-                       and args.hf_subset is None else HuggingFaceDataset)
-        common_kwargs['dataset_subset'] = args.hf_subset
-        common_kwargs['dataset_split'] = args.hf_split
-        sample_kwargs["enable_multimodal_chat"] = True
+        if args.dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH:
+            if args.backend != "vllm-chat":
+                raise ValueError(
+                    "hf datasets only are supported by vllm-chat backend")
+            # Choose between VisionArenaDataset and HuggingFaceDataset based on
+            # provided parameters.
+            dataset_cls = (VisionArenaDataset if args.dataset_path
+                           == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                           and args.hf_subset is None else HuggingFaceDataset)
+            common_kwargs['dataset_subset'] = args.hf_subset
+            common_kwargs['dataset_split'] = args.hf_split
+            sample_kwargs["enable_multimodal_chat"] = True
+        elif args.dataset_path == "likaixin/InstructCoder":
+            dataset_cls = InstructCoderDataset
+            common_kwargs['dataset_split'] = "train"
 
     else:
         raise ValueError(f"Unknown dataset name: {args.dataset_name}")

@@ -462,9 +468,14 @@ def validate_args(args):
         warnings.warn("--hf-subset and --hf-split will be ignored \
             since --dataset-name is not 'hf'.",
                       stacklevel=2)
-    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
-        raise ValueError(
-            "When --dataset-name is 'hf', backend must be 'vllm-chat'")
+    elif args.dataset_name == "hf":
+        if args.dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH:
+            assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend."  # noqa: E501
+        elif args.dataset_path == "likaixin/InstructCoder":
+            assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend."  # noqa: E501
+        else:
+            raise ValueError(
+                f"{args.dataset_path} is not a supported hf dataset.")
 
     # --random-range-ratio: only used when dataset_name is 'random'
     if args.dataset_name != 'random' and args.random_range_ratio is not None:
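
The three branches above encode a backend-per-dataset rule that can be restated as a small pure function; this sketch is illustrative only (not part of the commit) and reuses the path constant imported from benchmark_dataset:

    # Illustrative restatement of the backend rules checked in validate_args.
    from benchmark_dataset import VisionArenaDataset

    def required_backend(dataset_path: str) -> str:
        """Map an hf dataset path to the backend the benchmark insists on."""
        if dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH:
            return "vllm-chat"  # multimodal chat requests
        if dataset_path == "likaixin/InstructCoder":
            return "vllm"  # plain text-completion requests
        raise ValueError(f"{dataset_path} is not a supported hf dataset.")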
