12
12
import torch
13
13
import uvloop
14
14
from benchmark_dataset import (BurstGPTDataset , HuggingFaceDataset ,
15
- RandomDataset , SampleRequest , ShareGPTDataset ,
16
- SonnetDataset , VisionArenaDataset )
15
+ InstructCoderDataset , RandomDataset ,
16
+ SampleRequest , ShareGPTDataset , SonnetDataset ,
17
+ VisionArenaDataset )
17
18
from benchmark_utils import convert_to_pytorch_benchmark_format , write_to_json
18
19
from tqdm import tqdm
19
20
from transformers import (AutoModelForCausalLM , AutoTokenizer ,
@@ -300,6 +301,7 @@ def get_requests(args, tokenizer):
300
301
"input_len" : args .input_len ,
301
302
"output_len" : args .output_len ,
302
303
}
304
+
303
305
if args .dataset_path is None or args .dataset_name == "random" :
304
306
sample_kwargs ["range_ratio" ] = args .random_range_ratio
305
307
sample_kwargs ["prefix_len" ] = args .prefix_len
@@ -317,17 +319,21 @@ def get_requests(args, tokenizer):
317
319
elif args .dataset_name == "burstgpt" :
318
320
dataset_cls = BurstGPTDataset
319
321
elif args .dataset_name == "hf" :
320
- if args .backend != "vllm-chat" :
321
- raise ValueError (
322
- "hf datasets only are supported by vllm-chat backend" )
323
- # Choose between VisionArenaDataset and HuggingFaceDataset based on
324
- # provided parameters.
325
- dataset_cls = (VisionArenaDataset if args .dataset_path
326
- == VisionArenaDataset .VISION_ARENA_DATASET_PATH
327
- and args .hf_subset is None else HuggingFaceDataset )
328
- common_kwargs ['dataset_subset' ] = args .hf_subset
329
- common_kwargs ['dataset_split' ] = args .hf_split
330
- sample_kwargs ["enable_multimodal_chat" ] = True
322
+ if args .dataset_path == VisionArenaDataset .VISION_ARENA_DATASET_PATH :
323
+ if args .backend != "vllm-chat" :
324
+ raise ValueError (
325
+ "hf datasets only are supported by vllm-chat backend" )
326
+ # Choose between VisionArenaDataset and HuggingFaceDataset based on
327
+ # provided parameters.
328
+ dataset_cls = (VisionArenaDataset if args .dataset_path
329
+ == VisionArenaDataset .VISION_ARENA_DATASET_PATH
330
+ and args .hf_subset is None else HuggingFaceDataset )
331
+ common_kwargs ['dataset_subset' ] = args .hf_subset
332
+ common_kwargs ['dataset_split' ] = args .hf_split
333
+ sample_kwargs ["enable_multimodal_chat" ] = True
334
+ elif args .dataset_path == "likaixin/InstructCoder" :
335
+ dataset_cls = InstructCoderDataset
336
+ common_kwargs ['dataset_split' ] = "train"
331
337
332
338
else :
333
339
raise ValueError (f"Unknown dataset name: { args .dataset_name } " )
@@ -462,9 +468,14 @@ def validate_args(args):
462
468
warnings .warn ("--hf-subset and --hf-split will be ignored \
463
469
since --dataset-name is not 'hf'." ,
464
470
stacklevel = 2 )
465
- elif args .dataset_name == "hf" and args .backend != "vllm-chat" :
466
- raise ValueError (
467
- "When --dataset-name is 'hf', backend must be 'vllm-chat'" )
471
+ elif args .dataset_name == "hf" :
472
+ if args .dataset_path == VisionArenaDataset .VISION_ARENA_DATASET_PATH :
473
+ assert args .backend == "vllm-chat" , "VisionArenaDataset needs to use vllm-chat as the backend." #noqa: E501
474
+ elif args .dataset_path == "likaixin/InstructCoder" :
475
+ assert args .backend == "vllm" , "InstructCoder dataset needs to use vllm as the backend." #noqa: E501
476
+ else :
477
+ raise ValueError (
478
+ f"{ args .dataset_path } is not supported by hf dataset." )
468
479
469
480
# --random-range-ratio: only used when dataset_name is 'random'
470
481
if args .dataset_name != 'random' and args .random_range_ratio is not None :
0 commit comments