@@ -77,10 +77,7 @@ def __init__(
         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
 
-        if config.max_model_length is not None:
-            self._max_length = int(config.max_model_length)
-        else:
-            self._max_length = self.tokenizer.model_max_length or self.tokenizer.max_position_embeddings
+        self._max_length = int(config.max_model_length) if config.max_model_length is not None else None
 
         # If model_parallel is not set we compare the number of processes with the number of GPUs
         self.model = self._create_auto_model(config, env_config)
@@ -152,6 +149,13 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             return None
 
         model = LLM(**self.model_args)
+
+        # If max_length could not be extracted from the config, infer it from the model itself.
+        # Inferring it from the tokenizer makes vllm fail for models whose model config and
+        # tokenizer config disagree, e.g. mistralai/Mistral-7B-v0.1.
+        if self._max_length is None:
+            self._max_length = model.llm_engine.model_config.max_seq_len_to_capture
+
         return model
 
     def _create_auto_tokenizer(self, config: VLLMModelConfig, env_config: EnvConfig):
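Taken together, the two hunks above change where the maximum length comes from when the user does not set it: the tokenizer fallback is dropped and the value is read from the vLLM engine instead. A minimal sketch of the resulting resolution order, assuming a hypothetical helper resolve_max_length (the attribute chain on LLM is the one the diff itself reads):

    from typing import Optional

    from vllm import LLM

    def resolve_max_length(max_model_length: Optional[int], model: LLM) -> int:
        # An explicit value from the model config always wins.
        if max_model_length is not None:
            return int(max_model_length)
        # Otherwise defer to the engine, whose value comes from the model config;
        # the tokenizer's model_max_length can disagree with it
        # (e.g. mistralai/Mistral-7B-v0.1), which is what used to break vllm.
        return model.llm_engine.model_config.max_seq_len_to_capture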
@@ -164,36 +168,6 @@ def _create_auto_tokenizer(self, config: VLLMModelConfig, env_config: EnvConfig)
             tokenizer.pad_token = tokenizer.eos_token
         return tokenizer
 
-    def _init_max_length(self, max_length) -> int:
-        """Return the maximum sequence length of the model.
-        NOTE: Different model configurations have different max sequence length
-        attribute names.
-            - n_positions: (CTRLConfig)
-            - max_position_embeddings: (BartConfig, RoFormerConfig)
-            - n_ctx: (GPT2Config)
-        NOTE: For relative position encoded models you should specify the max
-        sequence length of the model in the constructor via `max_length`.
-
-        Args:
-            max_length (Optional[int]): The maximum length of the input sequence. If not provided, it will be determined
-                based on the model's configuration or tokenizer's model_max_length attribute.
-
-        Returns:
-            int: Max length to use depending on the available args and config
-        """
-        if max_length is not None:
-            return int(max_length)
-        # Try to get the sequence length from the model config.
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-
-        for attr in seqlen_config_attrs:
-            if hasattr(self._config, attr):
-                return getattr(self._config, attr)
-
-        # Default max sequence length setting for when no `max_length` is provided
-        # or no max length config setting is found in the model or tokenizer.
-        return 2048
-
     def greedy_until(
         self,
         requests: list[GreedyUntilRequest],
@@ -300,7 +274,7 @@ def _generate(
         """Contains the actual logic of the generation."""
         if generate:
             sampling_params = SamplingParams(
-                temperature=1.0 if num_samples > 1 else 0.0,
+                temperature=float(self._config.temperature) if num_samples > 1 else 0.0,
                 n=num_samples,
                 max_tokens=max_new_tokens,
                 stop=stop_tokens,
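With the temperature change above, multi-sample generation now respects the configured temperature instead of a hard-coded 1.0, while single-sample generation stays greedy. A small usage sketch, assuming a hypothetical wrapper build_sampling_params (the 0.8 value is illustrative, not from the commit):

    from vllm import SamplingParams

    def build_sampling_params(
        temperature: float, num_samples: int, max_new_tokens: int, stop_tokens: list[str]
    ) -> SamplingParams:
        # Several samples per prompt require stochastic decoding, so the configured
        # temperature applies; a single sample keeps greedy decoding (0.0).
        return SamplingParams(
            temperature=float(temperature) if num_samples > 1 else 0.0,
            n=num_samples,
            max_tokens=max_new_tokens,
            stop=stop_tokens,
        )

    # e.g. four samples at the configured temperature:
    params = build_sampling_params(0.8, num_samples=4, max_new_tokens=256, stop_tokens=["\n\n"])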