2 changes: 1 addition & 1 deletion vllm/config/cache.py
@@ -23,7 +23,7 @@

BlockSize = Literal[1, 8, 16, 32, 64, 128]
CacheDType = Literal["auto", "bfloat16", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
-MambaDType = Literal["auto", "float32"]
+MambaDType = Literal["auto", "float32", "fp8", "fp8_e4m3"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]


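The widened MambaDType literal is what lets the SSM (temporal) state cache be requested in fp8 through CacheConfig.mamba_ssm_cache_dtype, the field the mixer consults below. A minimal usage sketch, assuming the offline LLM entry point forwards an engine argument of the same name and that the chosen model uses Mamba2 layers; the model name is purely illustrative:

from vllm import LLM

# Keep the Mamba2 SSM state cache in fp8 while the conv state stays in the
# model/compute dtype. NOTE: the keyword argument mirrors
# CacheConfig.mamba_ssm_cache_dtype and is assumed to be accepted by the
# engine arguments; the model name is only an example.
llm = LLM(
    model="ibm-ai-platform/Bamba-9B",
    mamba_ssm_cache_dtype="fp8",
)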
14 changes: 12 additions & 2 deletions vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -46,6 +46,7 @@
sharded_weight_loader,
)
from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op
from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata

@@ -465,7 +466,7 @@ def __init__(
compilation_config.static_forward_context[prefix] = self
# The tuple is (conv_state, ssm_state)
self.kv_cache = (torch.tensor([]), torch.tensor([]))

+        self.fp8_dtype = current_platform.fp8_dtype()
self.model_config = model_config
self.cache_config = cache_config
self.prefix = prefix
@@ -514,7 +515,10 @@ def forward_cuda(
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
# conv_state = (..., dim, width-1) yet contiguous along 'dim'
conv_state = self_kv_cache[0].transpose(-1, -2)
-        ssm_state = self_kv_cache[1]
+        if self.cache_config.mamba_ssm_cache_dtype.startswith("fp8"):
+            ssm_state = self_kv_cache[1].view(self.fp8_dtype)
+        else:
+            ssm_state = self_kv_cache[1]
state_indices_tensor = attn_metadata.state_indices_tensor
has_initial_states_p = attn_metadata.has_initial_states_p
prep_initial_states = attn_metadata.prep_initial_states
@@ -689,6 +693,9 @@ def forward_cuda(
0,
)

+            # TODO: add fp8 dequantization logic here when loading
+            # ssm state. Should load scales tensors if available
+
# NOTE: final output is an in-place update of out tensor
varlen_states = mamba_chunk_scan_combined_varlen(
hidden_states_p.view(
@@ -791,6 +798,9 @@ def forward_cuda(
# tensor
ssm_state[state_indices_tensor_p] = varlen_states

+            # TODO: Add fp8 quantization logic here for storing back
+            # ssm state. Should also update scales if dynamic
+
# Process decode requests
if has_decode:
if prefix_caching_enabled:
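
Two things in this file are worth spelling out. First, the cache block handed to the layer keeps its allocated storage dtype (the fp8 cache strings have historically resolved to a 1-byte storage dtype), so the fp8 path reinterprets it with Tensor.view(self.fp8_dtype) rather than casting. Second, the two TODOs mark where scale-aware quantize/dequantize would go. Below is a minimal sketch of what those helpers could look like, assuming symmetric per-tensor scaling and the float8_e4m3fn dtype; the helper names and the uint8-backed block are illustrative, not part of the PR:

import torch

FP8_E4M3_MAX = 448.0  # largest magnitude representable in float8_e4m3fn

def quantize_ssm_state(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Hypothetical store-path helper: scale, clamp to the fp8 range, then cast.
    return (x / scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)

def dequantize_ssm_state(x_fp8: torch.Tensor, scale: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # Hypothetical load-path helper: cast back up and undo the scale.
    return x_fp8.to(dtype) * scale.to(dtype)

# The cache block may be allocated with a 1-byte storage dtype and then
# reinterpreted, mirroring self_kv_cache[1].view(self.fp8_dtype) above.
raw_block = torch.zeros(2, 8, dtype=torch.uint8)
ssm_state = raw_block.view(torch.float8_e4m3fn)

scale = torch.tensor(0.05)
update = torch.randn(2, 8, dtype=torch.bfloat16)
ssm_state.copy_(quantize_ssm_state(update, scale))                 # storing back (second TODO)
restored = dequantize_ssm_state(ssm_state, scale, torch.bfloat16)  # loading (first TODO)

A dynamic variant would recompute scale from update.abs().max() before each store and persist it next to the block, which is what the "update scales if dynamic" note points at.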
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/mamba/mamba_utils.py
@@ -51,6 +51,8 @@ def _mamba_state_dtype(
mamba_cache_dtype: MambaDType,
mamba_ssm_cache_dtype: MambaDType,
) -> tuple[torch.dtype, ...]:
+    if mamba_cache_dtype.startswith("fp8"):
+        raise ValueError("fp8 mamba conv state is not supported")
conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
if mamba_ssm_cache_dtype == "auto":
temporal_state_dtype = conv_state_dtype
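
The guard keeps fp8 restricted to the temporal state: requesting an fp8 conv state fails fast, while mamba_ssm_cache_dtype may still resolve to an fp8 cache dtype. A standalone paraphrase of the resolution logic as a sketch, with the import location of get_kv_cache_torch_dtype assumed rather than shown in the hunk:

import torch
from vllm.utils import get_kv_cache_torch_dtype  # import path assumed, not shown in the hunk

def resolve_mamba_state_dtypes(model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype):
    # Paraphrase of _mamba_state_dtype: the conv state may never be fp8.
    if mamba_cache_dtype.startswith("fp8"):
        raise ValueError("fp8 mamba conv state is not supported")
    conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
    # "auto" keeps the SSM state in the conv-state dtype; anything else,
    # including the new fp8 options, is resolved independently.
    if mamba_ssm_cache_dtype == "auto":
        temporal_state_dtype = conv_state_dtype
    else:
        temporal_state_dtype = get_kv_cache_torch_dtype(mamba_ssm_cache_dtype, model_dtype)
    return conv_state_dtype, temporal_state_dtype

# e.g. resolve_mamba_state_dtypes(torch.bfloat16, "fp8", "auto") raises ValueError,
# while ("auto", "fp8") succeeds. Note that the fp8 strings have historically mapped
# to a 1-byte storage dtype, which is why the mixer re-views the SSM state as the
# platform fp8 dtype before use.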