@@ -681,21 +681,34 @@ def hf_runner():
 
 
 class VllmRunner:
+    """
+    The default values of some arguments have been modified from
+    :class:`~vllm.LLM` as follows:
+    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
+    - `seed`: Set to `0` instead of `None` for test reproducibility.
+    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
+    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
+    - `enable_chunked_prefill`: Set to `False` instead of `None` for
+      test reproducibility.
+    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
+    """
 
     def __init__(
         self,
         model_name: str,
         task: TaskOption = "auto",
         tokenizer_name: Optional[str] = None,
         tokenizer_mode: str = "auto",
+        trust_remote_code: bool = True,
+        seed: Optional[int] = 0,
         # Use smaller max model length, otherwise bigger model cannot run due
         # to kv cache size limit.
         max_model_len: int = 1024,
         dtype: str = "half",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
-        enable_chunked_prefill: bool = False,
+        enable_chunked_prefill: Optional[bool] = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
         **kwargs,
@@ -705,8 +718,9 @@ def __init__(
             task=task,
             tokenizer=tokenizer_name,
             tokenizer_mode=tokenizer_mode,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
             dtype=dtype,
+            seed=seed,
             swap_space=swap_space,
             enforce_eager=enforce_eager,
             disable_log_stats=disable_log_stats,
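With these defaults in place, a test can construct the runner with only a model name and still get reproducible behavior, or override any of the new keyword arguments per instance. A minimal usage sketch follows; the model name, the context-manager pattern, and the `generate_greedy` helper are assumptions based on how this runner is used elsewhere in vllm's test suite, not part of this diff:

```python
# Hypothetical test sketch, not part of this diff: the model name and
# generate_greedy() helper are assumed from the surrounding test suite.
with VllmRunner("facebook/opt-125m") as vllm_model:
    # trust_remote_code=True, seed=0, max_model_len=1024, block_size=16,
    # enable_chunked_prefill=False, and enforce_eager=False all come from
    # the new defaults, so no extra arguments are needed for reproducibility.
    outputs = vllm_model.generate_greedy(["Hello, my name is"], max_tokens=16)

# Tests that need different behavior can now opt out per instance instead of
# being stuck with the previously hard-coded trust_remote_code=True:
with VllmRunner("facebook/opt-125m", trust_remote_code=False, seed=None) as m:
    outputs = m.generate_greedy(["Hello, my name is"], max_tokens=16)
```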