 from typing import TYPE_CHECKING

 import vllm.envs as envs
-from vllm.config.compilation import CUDAGraphMode
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
@@ -290,7 +289,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:

         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
-        compilation_config = vllm_config.compilation_config

         # Set mamba block size to max_model_len (this may get
         # override by prefix caching logic later)
@@ -320,19 +318,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "for hybrid models.")
             model_config.disable_cascade_attn = True

-        # TODO(tdoublep): remove as full cuda graph support is added
-        FCG_NOT_SUPPORTED_MODELS = [
-            "Lfm2ForCausalLM",
-            "MiniMaxText01ForCausalLM",
-        ]
-
-        if (model_config.architecture not in FCG_NOT_SUPPORTED_MODELS
-                and compilation_config.cudagraph_mode is None):
-            logger.info(
-                "Hybrid or mamba-based model detected: setting cudagraph mode "
-                "to FULL_AND_PIECEWISE in order to optimize performance.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.FULL_AND_PIECEWISE
-

 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):

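The block removed in the last hunk auto-selected CUDAGraphMode.FULL_AND_PIECEWISE whenever a hybrid or mamba-based model was detected and the user had left cudagraph_mode unset. With it gone, the model config no longer forces that mode; a user who still wants full CUDA graphs for such a model can request them explicitly. A minimal sketch, assuming LLM accepts a compilation_config argument and that CompilationConfig exposes the cudagraph_mode field used in the removed code; the model name is a placeholder:

# Explicitly opt in to full + piecewise CUDA graphs now that the
# automatic override for hybrid/mamba models is no longer applied.
# Assumptions: LLM(..., compilation_config=...) is supported and
# CompilationConfig has a `cudagraph_mode` field (as referenced in the diff).
from vllm import LLM
from vllm.config import CompilationConfig
from vllm.config.compilation import CUDAGraphMode

llm = LLM(
    model="org/hybrid-model",  # placeholder model name
    compilation_config=CompilationConfig(
        cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE,
    ),
)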