Skip to content

Commit fe9a2a6

Browse files
committed
fix: handle LFM2/Mamba hybrid layers in _get_vllm_state_dict for fast_inference
Fix an `UnboundLocalError` when using `fast_inference=True` with LFM2/Mamba hybrid models (e.g. LiquidAI/LFM2.5-1.2B-Thinking). The crash occurred because `_get_vllm_state_dict` only handled `self_attn` and `cross_attn` layer types, leaving the `prefix` variable unset for short-convolution layers.

Changes:
- Add a `short_conv` branch to extract the conv `in_proj`, `out_proj`, and `conv` weights
- Move `o_proj` extraction inside the attention branches (LFM2 uses `out_proj`)
- Add `feed_forward.w1/w2/w3` MLP handling alongside the standard `mlp` path
- Handle `embedding_norm` (LFM2) in addition to `norm` for the final model norm
- Add LFM2 layer templates to `get_model_layer_config`
- Handle `embedding_norm` in `set_additional_modules` for HF reconstruction

Fixes #4073
1 parent b6dbba1 commit fe9a2a6

File tree

2 files changed

+112
-22
lines changed

2 files changed

+112
-22
lines changed

unsloth_zoo/empty_model.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -442,10 +442,17 @@ def set_embedding(module, embed_tokens_key, pad_token_id, requires_grad=False):
442442
set_embedding(new_model.model.visual.pos_embed, 'model.visual.pos_embed.weight', None, requires_grad=False)
443443

444444
# Norm
445+
# LFM2 uses embedding_norm instead of norm
445446
norm_key = f"{language_model_prefix}.norm.weight"
446-
norm = quant_state_dict[norm_key]
447-
norm = torch.nn.Parameter(norm, requires_grad = False)
448-
language_model.norm.weight = norm
447+
embedding_norm_key = f"{language_model_prefix}.embedding_norm.weight"
448+
if norm_key in quant_state_dict:
449+
norm = quant_state_dict[norm_key]
450+
norm = torch.nn.Parameter(norm, requires_grad = False)
451+
language_model.norm.weight = norm
452+
elif embedding_norm_key in quant_state_dict:
453+
norm = quant_state_dict[embedding_norm_key]
454+
norm = torch.nn.Parameter(norm, requires_grad = False)
455+
language_model.embedding_norm.weight = norm
449456

450457
# LM Head. Do note that for some models, like Mistral3ForConditionalGeneration,
451458
# there can be mismatch in the value of tie_word_embeddings between config and text_config
@@ -539,6 +546,17 @@ def get_model_layer_config(return_non_layered=True):
539546
"model.layers.{kk}.mlp.up_proj",
540547
"model.layers.{kk}.mlp.gate_up_proj", # for extracting from vLLM (phi3 architecture)
541548
"model.layers.{kk}.mlp.down_proj",
549+
550+
# LFM2 hybrid model layers (attention + short convolution)
551+
"model.layers.{kk}.self_attn.out_proj", # LFM2 attention uses out_proj instead of o_proj
552+
"model.layers.{kk}.self_attn.q_layernorm",
553+
"model.layers.{kk}.self_attn.k_layernorm",
554+
"model.layers.{kk}.conv.in_proj",
555+
"model.layers.{kk}.conv.out_proj",
556+
"model.layers.{kk}.conv.conv",
557+
"model.layers.{kk}.feed_forward.w1",
558+
"model.layers.{kk}.feed_forward.w3",
559+
"model.layers.{kk}.feed_forward.w2",
542560
},
543561
'layernorms': {
544562
"model.language_model.layers.{kk}.input_layernorm",
@@ -555,6 +573,13 @@ def get_model_layer_config(return_non_layered=True):
555573
"model.layers.{kk}.post_feedforward_layernorm",
556574
"model.layers.{kk}.self_attn.q_norm",
557575
"model.layers.{kk}.self_attn.k_norm",
576+
577+
# LFM2 hybrid model norms
578+
"model.layers.{kk}.operator_norm", # pre-block norm (replaces input_layernorm)
579+
"model.layers.{kk}.ffn_norm", # post-block norm (replaces post_attention_layernorm)
580+
"model.layers.{kk}.self_attn.q_layernorm", # QK norm inside attention
581+
"model.layers.{kk}.self_attn.k_layernorm", # QK norm inside attention
582+
558583
"model.visual.blocks.{kk}.norm1",
559584
"model.visual.blocks.{kk}.norm2",
560585
"model.vision_tower.vision_model.encoder.layers.{kk}.post_layernorm",

unsloth_zoo/vllm_utils.py

Lines changed: 84 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,7 +1095,6 @@ def _is_fused_module(name: str) -> bool:
10951095
if hasattr(layer, "self_attn"):
10961096
prefix = f"{vllm_text_model_prefix}.layers.{kk}.self_attn"
10971097
qkv_proj = layer.self_attn.qkv_proj
1098-
o_proj = layer.self_attn.o_proj
10991098

11001099
use_fused_qkv = _is_fused_module("qkv_proj")
11011100
if use_fused_qkv:
@@ -1107,6 +1106,16 @@ def _is_fused_module(name: str) -> bool:
11071106
get_state_dict(f"{prefix}.q_proj", 0, state_dict, qkv_proj)
11081107
get_state_dict(f"{prefix}.k_proj", 1, state_dict, qkv_proj)
11091108
get_state_dict(f"{prefix}.v_proj", 2, state_dict, qkv_proj)
1109+
1110+
# Extract o_proj or out_proj depending on model architecture
1111+
# LFM2 uses out_proj, most other models use o_proj
1112+
if hasattr(layer.self_attn, "o_proj"):
1113+
o_proj = layer.self_attn.o_proj
1114+
get_state_dict(f"{prefix}.o_proj", 0, state_dict, o_proj)
1115+
elif hasattr(layer.self_attn, "out_proj"):
1116+
out_proj = layer.self_attn.out_proj
1117+
get_state_dict(f"{prefix}.out_proj", 0, state_dict, out_proj)
1118+
11101119
elif hasattr(layer, "cross_attn"):
11111120
prefix = f"{vllm_text_model_prefix}.layers.{kk}.cross_attn"
11121121
qkv_proj = layer.cross_attn.qkv_proj
@@ -1118,22 +1127,72 @@ def _is_fused_module(name: str) -> bool:
11181127
get_state_dict(f"{prefix}.q_proj", 0, state_dict, q_proj)
11191128
get_state_dict(f"{prefix}.k_proj", 1, state_dict, kv_proj)
11201129
get_state_dict(f"{prefix}.v_proj", 2, state_dict, kv_proj)
1130+
get_state_dict(f"{prefix}.o_proj", 0, state_dict, o_proj)
1131+
1132+
elif hasattr(layer, "short_conv"):
1133+
# LFM2 hybrid short convolution layers (non-attention layers)
1134+
# vLLM ShortConv: in_proj (MergedColumnParallelLinear, splits into B, C, x),
1135+
# out_proj (RowParallelLinear),
1136+
# conv (ColumnParallelLinear, weight unsqueezed for conv1d)
1137+
# HF Lfm2ShortConv: in_proj (nn.Linear, hidden -> 3*hidden),
1138+
# out_proj (nn.Linear, hidden -> hidden),
1139+
# conv (nn.Conv1d, depthwise with groups=hidden_size)
1140+
conv_prefix = f"{vllm_text_model_prefix}.layers.{kk}.conv"
1141+
short_conv = layer.short_conv
1142+
1143+
# in_proj: vLLM splits into 3 shards via MergedColumnParallelLinear,
1144+
# but HF stores as a single nn.Linear(hidden, 3*hidden)
1145+
get_state_dict(f"{conv_prefix}.in_proj", 0, state_dict, short_conv.in_proj, slice_weights=False)
1146+
1147+
# out_proj: direct mapping
1148+
get_state_dict(f"{conv_prefix}.out_proj", 0, state_dict, short_conv.out_proj)
1149+
1150+
# conv: vLLM stores as ColumnParallelLinear with weight shape (out, 1, kernel)
1151+
# HF expects nn.Conv1d weight with same shape (hidden, 1, kernel) for depthwise conv
1152+
conv_module = short_conv.conv
1153+
conv_weight = getattr(conv_module, "base_layer", conv_module).weight
1154+
conv_weight.requires_grad_(False)
1155+
state_dict[f"{conv_prefix}.conv.weight"] = conv_weight
1156+
quant_state_dict[f"{conv_prefix}.conv.weight"] = conv_weight
1157+
# Handle conv bias if present
1158+
conv_bias = getattr(conv_module, "bias", None)
1159+
if conv_bias is None:
1160+
conv_bias = getattr(getattr(conv_module, "base_layer", conv_module), "bias", None)
1161+
if conv_bias is not None:
1162+
conv_bias.requires_grad_(False)
1163+
state_dict[f"{conv_prefix}.conv.bias"] = conv_bias
1164+
quant_state_dict[f"{conv_prefix}.conv.bias"] = conv_bias
1165+
pass
11211166

1122-
get_state_dict(f"{prefix}.o_proj", 0, state_dict, o_proj)
1123-
1124-
proj = layer.mlp.gate_up_proj
1125-
use_fused_gate_up = _is_fused_module("gate_up_proj")
1126-
if use_fused_gate_up:
1127-
# For some model types like phi3 vllm will expect fused gate_up_proj (e.g. Phi3, Phi3.5-mini-instruct, Phi4-mini-instruct)
1128-
# so we should not split them here otherwise there will be a size mismatch when activating the adapter
1129-
# see https://github.com/vllm-project/vllm/blob/9b693d023cf595e60b5346fdeeb41cf2a6eda838/vllm/model_executor/models/phi3.py
1130-
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.gate_up_proj", 0, state_dict, proj, slice_weights=False)
1131-
else:
1132-
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.gate_proj", 0, state_dict, proj)
1133-
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.up_proj", 1, state_dict, proj)
1134-
1135-
proj = layer.mlp.down_proj
1136-
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.down_proj", 0, state_dict, proj)
1167+
# MLP / Feed Forward extraction
1168+
# LFM2 uses feed_forward with w1 (gate), w2 (down), w3 (up) — SwiGLU style
1169+
# Standard models use mlp with gate_up_proj (merged), down_proj
1170+
if hasattr(layer, "mlp"):
1171+
proj = layer.mlp.gate_up_proj
1172+
use_fused_gate_up = _is_fused_module("gate_up_proj")
1173+
if use_fused_gate_up:
1174+
# For some model types like phi3 vllm will expect fused gate_up_proj (e.g. Phi3, Phi3.5-mini-instruct, Phi4-mini-instruct)
1175+
# so we should not split them here otherwise there will be a size mismatch when activating the adapter
1176+
# see https://github.com/vllm-project/vllm/blob/9b693d023cf595e60b5346fdeeb41cf2a6eda838/vllm/model_executor/models/phi3.py
1177+
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.gate_up_proj", 0, state_dict, proj, slice_weights=False)
1178+
else:
1179+
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.gate_proj", 0, state_dict, proj)
1180+
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.up_proj", 1, state_dict, proj)
1181+
1182+
proj = layer.mlp.down_proj
1183+
get_state_dict(f"{vllm_text_model_prefix}.layers.{kk}.mlp.down_proj", 0, state_dict, proj)
1184+
1185+
elif hasattr(layer, "feed_forward"):
1186+
# LFM2 uses feed_forward with w1 (gate), w3 (up), w2 (down)
1187+
# In vLLM, w1 and w3 are merged into a single MergedColumnParallelLinear
1188+
ff_prefix = f"{vllm_text_model_prefix}.layers.{kk}.feed_forward"
1189+
w1_proj = layer.feed_forward.w1 # MergedColumnParallelLinear (w1 at index 0, w3 at index 1)
1190+
get_state_dict(f"{ff_prefix}.w1", 0, state_dict, w1_proj)
1191+
get_state_dict(f"{ff_prefix}.w3", 1, state_dict, w1_proj)
1192+
1193+
w2_proj = layer.feed_forward.w2 # RowParallelLinear
1194+
get_state_dict(f"{ff_prefix}.w2", 0, state_dict, w2_proj)
1195+
pass
11371196

11381197
# Use layernorms from the layer configuration
11391198
layernorm_names = [name.format(kk=kk) for name in layer_config['layernorms']]
@@ -1160,9 +1219,15 @@ def _is_fused_module(name: str) -> bool:
11601219
# Norm
11611220
# For Gemma3 and similar multimodal models, norm should be under model.norm
11621221
# For standard models, also under model.norm
1163-
norm_prefix = f"{vllm_text_model_prefix}.norm.weight"
1164-
state_dict[norm_prefix] = vllm_text_model.norm.weight.data
1165-
quant_state_dict[norm_prefix] = state_dict[norm_prefix]
1222+
# LFM2 uses embedding_norm instead of norm
1223+
if hasattr(vllm_text_model, "norm"):
1224+
norm_prefix = f"{vllm_text_model_prefix}.norm.weight"
1225+
state_dict[norm_prefix] = vllm_text_model.norm.weight.data
1226+
quant_state_dict[norm_prefix] = state_dict[norm_prefix]
1227+
elif hasattr(vllm_text_model, "embedding_norm"):
1228+
norm_prefix = f"{vllm_text_model_prefix}.embedding_norm.weight"
1229+
state_dict[norm_prefix] = vllm_text_model.embedding_norm.weight.data
1230+
quant_state_dict[norm_prefix] = state_dict[norm_prefix]
11661231

11671232
# LM Head - Use get_state_dict for consistency
11681233
if not getattr(text_config, "tie_word_embeddings", False):

0 commit comments

Comments
 (0)