@@ -173,14 +173,20 @@ def __init__(self,
173173 if self .enforce_eager is None :
174174 self .enforce_eager = False
175175
176- if (not self .disable_sliding_window
177- and self .hf_text_config .model_type == "gemma2"
178- and self .hf_text_config .sliding_window is not None ):
176+ sliding_window = getattr (self .hf_text_config , "sliding_window" , None )
177+ has_interleaved_attention = (sliding_window is not None ) and (
178+ isinstance (sliding_window , list ) or
179+ (self .hf_text_config .model_type in ["gemma2" ]))
180+
181+ if (not self .disable_sliding_window and has_interleaved_attention ):
182+ sliding_window_len_min = get_min_sliding_window (
183+ self .hf_text_config .sliding_window )
184+
179185 print_warning_once (
180- "Gemma 2 uses sliding window attention for every odd layer , "
186+ f" { self . hf_text_config . model_type } has interleaved attention, "
181187 "which is currently not supported by vLLM. Disabling sliding "
182188 "window and capping the max length to the sliding window size "
183- f"({ self . hf_text_config . sliding_window } )." )
189+ f"({ sliding_window_len_min } )." )
184190 self .disable_sliding_window = True
185191
186192 self .max_model_len = _get_and_verify_max_len (
@@ -431,7 +437,8 @@ def verify_with_parallel_config(
431437 "pipeline parallelism currently. Disabling it." )
432438 self .use_async_output_proc = False
433439
434- def get_hf_config_sliding_window (self ) -> Optional [int ]:
440+ def get_hf_config_sliding_window (
441+ self ) -> Union [Optional [int ], List [Optional [int ]]]:
435442 """Get the sliding window size, or None if disabled."""
436443
437444 # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in
@@ -442,7 +449,7 @@ def get_hf_config_sliding_window(self) -> Optional[int]:
442449 return None
443450 return getattr (self .hf_text_config , "sliding_window" , None )
444451
445- def get_sliding_window (self ) -> Optional [int ]:
452+ def get_sliding_window (self ) -> Optional [Union [ int , List [ Optional [ int ]]] ]:
446453 """Get the sliding window size, or None if disabled.
447454 """
448455 # If user disables sliding window, return None.
@@ -1689,7 +1696,7 @@ def _get_and_verify_max_len(
16891696 hf_config : PretrainedConfig ,
16901697 max_model_len : Optional [int ],
16911698 disable_sliding_window : bool ,
1692- sliding_window_len : Optional [int ],
1699+ sliding_window_len : Optional [Union [ int , List [ Optional [ int ]]] ],
16931700 spec_target_max_model_len : Optional [int ] = None ,
16941701) -> int :
16951702 """Get and verify the model's maximum length."""
@@ -1722,9 +1729,12 @@ def _get_and_verify_max_len(
17221729 # If sliding window is manually disabled, max_length should be less
17231730 # than the sliding window length in the model config.
17241731 if disable_sliding_window and sliding_window_len is not None :
1732+
1733+ sliding_window_len_min = get_min_sliding_window (sliding_window_len )
17251734 max_len_key = "sliding_window" \
1726- if sliding_window_len < derived_max_model_len else max_len_key
1727- derived_max_model_len = min (derived_max_model_len , sliding_window_len )
1735+ if sliding_window_len_min < derived_max_model_len else max_len_key
1736+ derived_max_model_len = min (derived_max_model_len ,
1737+ sliding_window_len_min )
17281738
17291739 # If none of the keys were found in the config, use a default and
17301740 # log a warning.
@@ -1805,6 +1815,14 @@ def _get_and_verify_max_len(
18051815 return int (max_model_len )
18061816
18071817
def get_min_sliding_window(
        sliding_window: Union[int, List[Optional[int]]]) -> int:
    """Return the smallest sliding window size in ``sliding_window``.

    Args:
        sliding_window: Either a single window size, or a sequence of
            window sizes in which ``None`` entries are skipped
            (presumably these mark entries with no sliding window;
            confirm against the model config).

    Returns:
        ``sliding_window`` itself when it is not a sequence, otherwise the
        minimum of its non-``None`` entries.

    Raises:
        ValueError: If a sequence is given that is empty or contains only
            ``None`` entries, so no minimum exists.
    """
    if not isinstance(sliding_window, (list, tuple)):
        return sliding_window

    window_sizes = [size for size in sliding_window if size is not None]
    if not window_sizes:
        # A bare min() over an empty iterable raises an error that never
        # mentions the config; name the offending value instead.
        raise ValueError(
            "Cannot determine the minimum sliding window: sliding_window "
            f"has no non-None entries (got {sliding_window!r}).")
    return min(window_sizes)
1824+
1825+
18081826def get_served_model_name (model : str ,
18091827 served_model_name : Optional [Union [str , List [str ]]]):
18101828 """
0 commit comments