@@ -291,9 +291,8 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config

-        # Set mamba block size to max_model_len (this may get
-        # override by prefix caching logic later)
-        cache_config.mamba_block_size = model_config.max_model_len
+        if cache_config.mamba_block_size is None:
+            cache_config.mamba_block_size = model_config.max_model_len

         if cache_config.enable_prefix_caching:
             if model_config.supports_mamba_prefix_caching:
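This hunk turns the unconditional overwrite into a fallback: a user-supplied `cache_config.mamba_block_size` now survives config verification. A minimal sketch of the new behavior, using simplified stand-ins rather than vLLM's actual config classes:

```python
# Sketch only: ModelConfig/CacheConfig here are hypothetical stand-ins.
from dataclasses import dataclass

@dataclass
class ModelConfig:
    max_model_len: int

@dataclass
class CacheConfig:
    mamba_block_size: int | None = None  # None means "not set by the user"

def apply_default(model_config: ModelConfig, cache_config: CacheConfig) -> None:
    # Fill in the default only when the user left the field unset,
    # instead of unconditionally overwriting it as before.
    if cache_config.mamba_block_size is None:
        cache_config.mamba_block_size = model_config.max_model_len

cfg = CacheConfig(mamba_block_size=512)
apply_default(ModelConfig(max_model_len=4096), cfg)
assert cfg.mamba_block_size == 512  # user value preserved
```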
@@ -333,6 +332,8 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         if not envs.VLLM_USE_V1:
             return

+        # Save the user input before it gets modified by MambaModelConfig
+        mamba_block_size = vllm_config.cache_config.mamba_block_size
         # Enable FULL_AND_PIECEWISE by default
         MambaModelConfig.verify_and_update_config(vllm_config)

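The snapshot matters because the nested `MambaModelConfig.verify_and_update_config` call fills in the `max_model_len` default (first hunk above); after that point, a user-supplied value is indistinguishable from the default. A small sketch of the ordering, again with hypothetical stand-ins:

```python
# Hypothetical stand-ins illustrating the save-before-mutate ordering;
# not vLLM's real call chain.
class CacheConfig:
    def __init__(self, mamba_block_size=None):
        self.mamba_block_size = mamba_block_size

def fill_default(cache_config, max_model_len=4096):
    # Mirrors what the nested verify_and_update_config does.
    if cache_config.mamba_block_size is None:
        cache_config.mamba_block_size = max_model_len

cache_config = CacheConfig()                      # user did not set a block size
mamba_block_size = cache_config.mamba_block_size  # snapshot: still None
fill_default(cache_config)                        # field is now 4096
# The snapshot lets later code fall back correctly:
base_chunk_size = mamba_block_size or 256         # -> 256 (kernel chunk size)
assert base_chunk_size == 256
```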
@@ -386,7 +387,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             # With prefix caching, select attention block size to
             # optimize for mamba kernel performance

-            # mamba SSD kernel uses a chunk_size, e.g. 256
+            # Mamba2 SSD kernel uses a chunk_size, e.g. 256
             # Align the block to the kernel: use lowest multiple of chunk_size
             # of attention tokens that would fit mamba_page_size:
             # e.g. for mamba page size = 788kB
@@ -404,7 +405,8 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             def lcm(a, b):
                 return a * b // gcd(a, b)

-            base_chunk_size = model_config.get_mamba_chunk_size()
+            base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
+
             attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)

             chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
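Putting the pieces together, here is a worked example of the sizing arithmetic. The 788 KiB page size and chunk_size of 256 come from the comments above; the per-token attention page size, the alignment value, and the final rounding step are assumed for illustration and are not vLLM's exact values:

```python
from math import gcd

def cdiv(a: int, b: int) -> int:
    return -(a // -b)  # ceiling division

def lcm(a: int, b: int) -> int:
    return a * b // gcd(a, b)

mamba_page_size = 788 * 1024        # 788 KiB, from the comment above
attn_page_size_1_token = 576        # bytes per attention token (assumed)
kernel_block_alignment_size = 16    # assumed alignment requirement
base_chunk_size = 256               # Mamba2 SSD kernel chunk_size

# Attention tokens whose pages cover one mamba state page:
attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)  # 1401
chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)               # 256
# "Lowest multiple of chunk_size that would fit mamba_page_size":
attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
print(attn_block_size)  # 1536 with the numbers above
```

With the diff applied, a user-set `mamba_block_size` takes the place of `base_chunk_size`, so the alignment is computed against the requested block size rather than the kernel default.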