Commit 85a754a
Support hybrid attention architectures in LayerWrapper (#2367)
## Summary

Allow `LayerWrapper` to handle models with hybrid layer types (e.g., Qwen3.5) where some decoder layers use linear attention instead of standard self-attention.

## Problem

Qwen3.5 is a hybrid VL model with 24 decoder layers: 18 use GatedDeltaNet linear attention (`linear_attn` sub-module) and 6 use standard full attention (`self_attn`). When Olive's `SelectiveMixedPrecision` or `GPTQ` passes wrap each layer with `LayerWrapper`, the constructor calls:

```python
self.attn, self.attn_name = get_submodules(
    layer, self.ATTENTION, self.model_type, return_name=True
)
```

This raises a `ValueError` for GatedDeltaNet layers since they don't have a `self_attn` attribute.

## Fix

Pass `fail_on_not_found=False` to the attention sub-module lookup in `LayerWrapper.__init__`:

```diff
-self.attn, self.attn_name = get_submodules(
-    layer, self.ATTENTION, self.model_type, return_name=True
-)
+# Use fail_on_not_found=False to support hybrid architectures (e.g., Qwen3.5)
+# where some layers use linear attention instead of standard self-attention
+self.attn, self.attn_name = get_submodules(
+    layer, self.ATTENTION, self.model_type, return_name=True, fail_on_not_found=False
+)
```

When a layer doesn't have a standard attention module, `self.attn` is set to `None` and the calibration passes gracefully skip attention-specific quantization for that layer while still processing the MLP.
Parent: 1bfa214
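To make the behavior described above concrete, here is a minimal, self-contained sketch. The toy layer classes and the `find_attention` helper are hypothetical (they are not Olive code); they only illustrate how a non-failing lookup returns `None` for a layer that exposes `linear_attn` instead of `self_attn`, so attention-specific handling can be skipped while the MLP is still processed.

```python
import torch.nn as nn

# Toy stand-ins for the two layer flavors described above (hypothetical,
# illustration only): a full-attention decoder layer exposing `self_attn`
# and a GatedDeltaNet-style layer exposing `linear_attn`.
class FullAttentionLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim=64, num_heads=4)
        self.mlp = nn.Linear(64, 64)


class LinearAttentionLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_attn = nn.Linear(64, 64)  # placeholder for GatedDeltaNet
        self.mlp = nn.Linear(64, 64)


def find_attention(layer: nn.Module, attr: str = "self_attn"):
    """Non-failing lookup: return (module, name), or (None, None) when absent."""
    module = getattr(layer, attr, None)
    return (module, attr) if module is not None else (None, None)


for layer in (FullAttentionLayer(), LinearAttentionLayer()):
    attn, attn_name = find_attention(layer)
    if attn is None:
        # Hybrid layer: skip attention-specific quantization, still handle the MLP.
        print(type(layer).__name__, "-> no standard attention; MLP only")
    else:
        print(type(layer).__name__, "-> attention found:", attn_name)
```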

File tree

1 file changed (+11 −1 lines)


olive/common/hf/wrapper.py

Lines changed: 11 additions & 1 deletion
@@ -134,7 +134,11 @@ def __init__(self, layer: nn.Module, model_type: str):
         self.layer = layer
         self.model_type = model_type
 
-        self.attn, self.attn_name = get_submodules(layer, self.ATTENTION, self.model_type, return_name=True)
+        # Use fail_on_not_found=False to support hybrid architectures (e.g., Qwen3.5)
+        # where some layers use linear attention instead of standard self-attention
+        self.attn, self.attn_name = get_submodules(
+            layer, self.ATTENTION, self.model_type, return_name=True, fail_on_not_found=False
+        )
         self.mlp, self.mlp_name = get_submodules(layer, self.MLP, self.model_type, return_name=True)
 
     def get_first_layer_norm(self, return_name: bool = True):
@@ -144,6 +148,8 @@ def get_second_layer_norm(self, return_name: bool = True):
         return get_submodules(self.layer, self.SECOND_LAYER_NORM, self.model_type, return_name=return_name)
 
     def get_attention_inputs(self, return_name: bool = True):
+        if self.attn is None:
+            return ([], []) if return_name else []
         attention_inputs, names = get_submodules(
             self.attn, self.ATTENTION_INPUTS, self.model_type, return_name=True, return_name_prefix=f"{self.attn_name}."
         )
@@ -153,6 +159,8 @@ def get_attention_inputs(self, return_name: bool = True):
         return attention_inputs if not return_name else (attention_inputs, names)
 
     def get_attention_outputs(self, return_name: bool = True):
+        if self.attn is None:
+            return ([], []) if return_name else []
         return get_submodules(
             self.attn,
             self.ATTENTION_OUTPUTS,
@@ -274,6 +282,8 @@ def maybe_untie_word_embeddings(self):
     def maybe_unpack_qkv(self):
         """Unpack the QKV projection matrix into separate projections for models like phi3."""
         for layer_wrapper in self.get_layer_wrappers():
+            if layer_wrapper.attn is None:
+                continue
             attn_inputs, attn_input_names = layer_wrapper.get_attention_inputs()
 
             if len(attn_inputs) != 1 or not isinstance(attn_inputs[0], nn.Linear):
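For reference, a small toy wrapper that mimics the early-return contract added to `get_attention_inputs`/`get_attention_outputs`. This is not Olive's `LayerWrapper`; the class body and the illustrative `q_proj` name are assumptions, sketched only to show that callers receive empty lists for layers without standard attention and can leave their quantization loops unchanged.

```python
from typing import Optional

import torch.nn as nn


class MiniLayerWrapper:
    """Toy wrapper (not Olive's LayerWrapper) mimicking the early-return contract."""

    def __init__(self, attn: Optional[nn.Module], attn_name: Optional[str]):
        self.attn, self.attn_name = attn, attn_name

    def get_attention_inputs(self, return_name: bool = True):
        # Mirrors the guard added in the diff: no standard attention -> empty result.
        if self.attn is None:
            return ([], []) if return_name else []
        modules = [self.attn]  # stand-in for the real sub-module lookup
        names = [f"{self.attn_name}.q_proj"]  # illustrative name only
        return (modules, names) if return_name else modules


full = MiniLayerWrapper(nn.Linear(64, 64), "self_attn")
hybrid = MiniLayerWrapper(None, None)   # e.g., a GatedDeltaNet layer
print(full.get_attention_inputs())      # ([Linear(...)], ['self_attn.q_proj'])
print(hybrid.get_attention_inputs())    # ([], []) -> nothing attention-specific to quantize
```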
