Skip to content

Commit 132258e

Browse files
committed
fix: issues with lite-whisper models
1 parent 76c0bb4 commit 132258e

File tree

9 files changed

+141
-90
lines changed

9 files changed

+141
-90
lines changed

include/ctranslate2/layers/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ namespace ctranslate2 {
150150
const models::QUANTIZATION_TYPE _quant_method;
151151
const bool _quantized_gemm;
152152
const ops::Gemm _gemm_op;
153+
const ops::Gemm _gemm_op_low_rank;
153154
const ops::Quantize _quantize_op;
154155
const ops::Dequantize _dequantize_op;
155156
const ops::ActivationType* _activation_type;

python/ctranslate2/converters/transformers.py

Lines changed: 78 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import gc
44
import itertools
55
import os
6-
import re
76

87
from typing import List, Optional
98

@@ -97,13 +96,6 @@ def __init__(
9796
trust_remote_code: Allow converting models using custom code.
9897
"""
9998
self._model_name_or_path = model_name_or_path
100-
self._model_processor_name = model_name_or_path
101-
if model_name_or_path.startswith('efficient-speech/lite-whisper'):
102-
# If this is a lite-whisper model, use openai's
103-
# corresponding preprocessor.
104-
regex = r'whisper-[a-z0-9-]+?(?=-(?:fast|acc)|$)'
105-
regex_result = re.search(regex, model_name_or_path)
106-
self._model_processor_name = f"openai/{regex_result.group()}"
10799
self._activation_scales = activation_scales
108100
self._copy_files = copy_files
109101
self._load_as_float16 = load_as_float16
@@ -127,6 +119,14 @@ def _load(self):
127119
% (config_name, ", ".join(sorted(_MODEL_LOADERS.keys())))
128120
)
129121

122+
# If this is a lite-whisper model, use the corresponding OpenAI tokenizer
123+
if config.model_type == "lite-whisper":
124+
base_name = self._model_name_or_path.split("/")[-1] # e.g., "lite-whisper-large-v3"
125+
base_name = base_name.replace("lite-", "") # e.g., "whisper-large-v3"
126+
tokenizer_path = f"openai/{base_name}"
127+
else:
128+
tokenizer_path = self._model_name_or_path
129+
130130
tokenizer_class = transformers.AutoTokenizer
131131

132132
kwargs = {
@@ -147,18 +147,15 @@ def _load(self):
147147
if hasattr(transformers, loader.architecture_name):
148148
model_class = getattr(transformers, loader.architecture_name)
149149
model = self.load_model(model_class, self._model_name_or_path, **kwargs)
150-
elif self._model_name_or_path.startswith('efficient-speech/lite-whisper'):
151-
model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs)
152150
else:
153-
raise ValueError(
154-
"The model %s is not supported by the converter. " % self._model_name_or_path)
151+
model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs)
155152

156153
tokenizer_kwargs = {}
157154
if self._trust_remote_code:
158155
tokenizer_kwargs["trust_remote_code"] = self._trust_remote_code
159156

160157
tokenizer = self.load_tokenizer(
161-
tokenizer_class, self._model_processor_name, **tokenizer_kwargs
158+
tokenizer_class, tokenizer_path, **tokenizer_kwargs
162159
)
163160

164161
spec = loader(model, tokenizer)
@@ -251,19 +248,6 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2):
251248
spec.weight = spec.weight.transpose(0, 1)
252249
if module.bias is not None:
253250
spec.bias = module.bias
254-
255-
def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2):
256-
if quant_type == common_spec.Quantization.CT2:
257-
spec.low_rank_weight_1 = module.weight1
258-
spec.low_rank_weight_2 = module.weight2
259-
else:
260-
spec.low_rank_weight_1 = module.qweight1
261-
spec.low_rank_weight_2 = module.qweight2
262-
spec.weight_scale = module.scales
263-
spec.weight_zero = module.qzeros
264-
265-
if module.bias is not None:
266-
spec.bias = module.bias
267251

268252
def set_embeddings(self, spec, module):
269253
spec.weight = module.weight
@@ -1044,10 +1028,45 @@ def get_model_spec(self, model):
10441028

10451029
return spec
10461030

1031+
1032+
def set_config(self, config, model, tokenizer):
1033+
gen_config = getattr(model, "generation_config", None)
1034+
1035+
if gen_config is not None:
1036+
config.suppress_ids = gen_config.suppress_tokens
1037+
config.suppress_ids_begin = gen_config.begin_suppress_tokens
1038+
if hasattr(gen_config, "alignment_heads"):
1039+
config.alignment_heads = gen_config.alignment_heads
1040+
if hasattr(gen_config, "lang_to_id"):
1041+
config.lang_ids = sorted(gen_config.lang_to_id.values())
1042+
else:
1043+
config.suppress_ids = model.config.suppress_tokens
1044+
config.suppress_ids_begin = model.config.begin_suppress_tokens
1045+
config.alignment_heads = _WHISPER_ALIGNMENT_HEADS.get(model.name_or_path)
1046+
1047+
if getattr(config, "lang_ids", None) is None:
1048+
config.lang_ids = self._get_lang_ids_from_tokenizer(tokenizer)
1049+
1050+
if config.alignment_heads is None:
1051+
config.alignment_heads = _WHISPER_ALIGNMENT_HEADS.get(model.name_or_path)
1052+
if config.alignment_heads is None:
1053+
# By default, use the last half of the decoder layers for alignment.
1054+
num_layers = model.config.decoder_layers
1055+
num_heads = model.config.decoder_attention_heads
1056+
config.alignment_heads = list(
1057+
itertools.product(
1058+
range(num_layers // 2, num_layers),
1059+
range(num_heads),
1060+
)
1061+
)
1062+
10471063
def set_encoder(self, spec, encoder):
1064+
"""
1065+
Override encoder mapping for LiteWhisper.
1066+
"""
10481067
self.set_conv1d(spec.conv1, encoder.conv1)
10491068
self.set_conv1d(spec.conv2, encoder.conv2)
1050-
1069+
10511070
self.set_common_layers(spec, encoder)
10521071

10531072
for layer_spec, layer in zip(spec.layer, encoder.layers):
@@ -1060,29 +1079,42 @@ def set_encoder(self, spec, encoder):
10601079
layer.self_attn_layer_norm,
10611080
)
10621081

1063-
# Double check if these are low rank or not because of potential
1064-
# fall backs to full precision.
1065-
if hasattr(layer.fc1, 'weight1'):
1082+
if hasattr(layer.fc1, "weight1"):
1083+
# low rank
10661084
self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1)
10671085
else:
10681086
layer_spec.ffn.linear_0 = common_spec.LinearSpec()
10691087
self.set_linear(layer_spec.ffn.linear_0, layer.fc1)
1070-
1071-
if hasattr(layer.fc2, 'weight1'):
1088+
1089+
if hasattr(layer.fc2, "weight1"):
1090+
# low rank
10721091
self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2)
10731092
else:
10741093
layer_spec.ffn.linear_1 = common_spec.LinearSpec()
10751094
self.set_linear(layer_spec.ffn.linear_1, layer.fc2)
10761095

10771096
self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm)
10781097

1098+
def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2):
1099+
if quant_type == common_spec.Quantization.CT2:
1100+
spec.low_rank_weight_1 = module.weight1.transpose(0, 1).contiguous()
1101+
spec.low_rank_weight_2 = module.weight2.transpose(0, 1).contiguous()
1102+
else:
1103+
spec.low_rank_weight_1 = module.qweight1.transpose(0, 1).contiguous()
1104+
spec.low_rank_weight_2 = module.qweight2.transpose(0, 1).contiguous()
1105+
spec.weight_scale = module.scales
1106+
spec.weight_zero = module.qzeros
1107+
1108+
if module.bias is not None:
1109+
spec.bias = module.bias
1110+
10791111
def set_low_rank_or_linear_router(self, spec, module, i):
10801112
if hasattr(module, "weight1"):
10811113
self.set_low_rank_linear(spec.linear[i], module)
10821114
else:
10831115
spec.linear[i] = common_spec.LinearSpec()
10841116
self.set_linear(spec.linear[i], module)
1085-
1117+
10861118
def set_low_rank_attention(self, spec, attention):
10871119
self.set_low_rank_or_linear_router(spec, attention.q_proj, 0)
10881120
self.set_low_rank_or_linear_router(spec, attention.k_proj, 1)
@@ -3000,6 +3032,7 @@ def main():
30003032
(3, 4),
30013033
],
30023034
"openai/whisper-tiny": [(2, 2), (3, 0), (3, 2), (3, 3), (3, 4), (3, 5)],
3035+
"efficient-speech/whisper-tiny": [(2, 2), (3, 0), (3, 2), (3, 3), (3, 4), (3, 5)],
30033036
"openai/whisper-base.en": [(3, 3), (4, 7), (5, 1), (5, 5), (5, 7)],
30043037
"openai/whisper-base": [
30053038
(3, 1),
@@ -3113,4 +3146,16 @@ def main():
31133146
(24, 1),
31143147
(25, 6),
31153148
],
3149+
"efficient-speech/whisper-large-v3": [
3150+
(7, 0),
3151+
(10, 17),
3152+
(12, 18),
3153+
(13, 12),
3154+
(16, 1),
3155+
(17, 14),
3156+
(19, 11),
3157+
(21, 4),
3158+
(24, 1),
3159+
(25, 6),
3160+
],
31163161
}

python/ctranslate2/specs/attention_spec.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,9 @@ def __init__(
3737
self.queries_scale = model_spec.OPTIONAL
3838

3939
self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
40-
if low_rank:
41-
self.linear = [common_spec.LowRankLinearSpec() for _ in range(4)]
42-
else:
43-
self.linear = [
44-
common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
45-
]
40+
linear_cls = common_spec.LinearLowRankSpec if low_rank else common_spec.LinearSpec
41+
count = 4 if low_rank else (2 if self_attention else 3)
42+
self.linear = [linear_cls() for _ in range(count)]
4643

4744
if relative_position:
4845
self.relative_position_keys = None

python/ctranslate2/specs/common_spec.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,6 @@ def __init__(self):
5151
def has_bias(self):
5252
return not isinstance(self.bias, str)
5353

54-
class LowRankLinearSpec(model_spec.LayerSpec):
55-
def __init__(self):
56-
super().__init__()
57-
self.low_rank_weight_1 = None
58-
self.low_rank_weight_2 = None
59-
self.weight_scale = model_spec.OPTIONAL
60-
self.weight_zero = model_spec.OPTIONAL
61-
self.bias = model_spec.OPTIONAL
62-
63-
def has_bias(self):
64-
return not isinstance(self.bias, str)
65-
6654

6755
class Conv1DSpec(model_spec.LayerSpec):
6856
def __init__(self):
@@ -76,3 +64,15 @@ def __init__(self):
7664
self.weight = None
7765
self.weight_scale = model_spec.OPTIONAL
7866
self.multiply_by_sqrt_depth = model_spec.OPTIONAL
67+
68+
69+
class LinearLowRankSpec(model_spec.LayerSpec):
70+
def __init__(self):
71+
self.low_rank_weight_1 = None
72+
self.low_rank_weight_2 = None
73+
self.weight_scale = model_spec.OPTIONAL
74+
self.weight_zero = model_spec.OPTIONAL
75+
self.bias = model_spec.OPTIONAL
76+
77+
def has_bias(self):
78+
return not isinstance(self.bias, str)

python/ctranslate2/specs/transformer_spec.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def __init__(
253253
rms_norm=False,
254254
num_heads_kv=None,
255255
sliding_window=None,
256-
low_rank=False
256+
low_rank=False,
257257
):
258258
self.self_attention = attention_spec.MultiHeadAttentionSpec(
259259
self_attention=True,
@@ -344,8 +344,9 @@ def __init__(
344344
class FeedForwardSpec(model_spec.LayerSpec):
345345
def __init__(self, glu=False, rms_norm=False, low_rank=False):
346346
self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
347-
self.linear_0 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec()
348-
self.linear_1 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec()
347+
linear_cls = common_spec.LinearLowRankSpec if low_rank else common_spec.LinearSpec
348+
self.linear_0 = linear_cls()
349+
self.linear_1 = linear_cls()
349350
if glu:
350351
self.linear_0_noact = common_spec.LinearSpec()
351352

python/ctranslate2/specs/whisper_spec.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,10 @@ def __init__(
4141
num_encoder_heads: The number of encoder attention heads.
4242
num_decoder_layers: The number of decoder layers.
4343
num_decoder_heads: The number of decoder attention heads.
44+
low_rank: Whether the model is a lite-whisper (low-rank) variant.
4445
"""
4546
super().__init__()
46-
self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads, low_rank)
47+
self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads, low_rank=low_rank)
4748
self.decoder = transformer_spec.TransformerDecoderSpec(
4849
num_decoder_layers,
4950
num_decoder_heads,

src/layers/attention.cc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -360,13 +360,12 @@ namespace ctranslate2 {
360360
q = &queries_proj;
361361
}
362362

363-
if (!_is_low_rank) {
364-
_linear[0](*q, fused_proj);
365-
} else {
366-
// Low-rank attention does not fuse qkv.
367-
_linear[0](*q, queries_proj);
363+
_linear[0](*q, fused_proj);
364+
365+
if (_is_low_rank) { // support low-rank
368366
_linear[1](*q, keys_proj);
369367
_linear[2](*q, values_proj);
368+
queries_proj = std::move(fused_proj);
370369
}
371370

372371
dim_t beam_size = 1;
@@ -375,7 +374,7 @@ namespace ctranslate2 {
375374

376375
if (!_self_attention) {
377376
if (_is_low_rank)
378-
throw std::invalid_argument("MultiHeadAttention does not support low-rank attention with cross-attention");
377+
throw std::invalid_argument("lite whisper doesn't use low-rank for cross-attention");
379378
queries_proj = std::move(fused_proj);
380379

381380
if (cached_keys == nullptr || cached_keys->empty()) {
@@ -411,7 +410,7 @@ namespace ctranslate2 {
411410

412411
if (_num_heads_kv < _num_heads) {
413412
if (_is_low_rank)
414-
throw std::invalid_argument("MutliHeadAttention does not support low-rank attention with multi-query or GQA");
413+
throw std::invalid_argument("lite whisper doesn't use low-rank for multi-query or GQA");
415414
if (queries_padder)
416415
queries_padder->add_padding(fused_proj);
417416

@@ -430,10 +429,11 @@ namespace ctranslate2 {
430429
}
431430

432431
} else {
433-
if (!_is_low_rank) {
432+
if (!_is_low_rank){
434433
split_heads(fused_proj, 3 * _num_heads, queries_padder);
435434
ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj);
436-
} else {
435+
}
436+
else{
437437
split_heads(queries_proj, _num_heads, queries_padder);
438438
split_heads(keys_proj, _num_heads_kv, queries_padder);
439439
split_heads(values_proj, _num_heads_kv, queries_padder);

src/layers/attention_layer.cc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,24 +52,24 @@ namespace ctranslate2 {
5252
}
5353

5454
static bool set_low_rank(const models::Model& model, const std::string& scope) {
55-
const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/linear_0/low_rank_weight_1");
56-
if (low_rank_weight) {
57-
return true;
55+
const dim_t max_layers = 4;
56+
for (int i = 0; i < max_layers; ++i) {
57+
std::string prefix = scope + "/linear_" + std::to_string(i);
58+
const StorageView* w1 = model.get_variable_if_exists(prefix + "/low_rank_weight_1");
59+
const StorageView* w2 = model.get_variable_if_exists(prefix + "/low_rank_weight_2");
60+
if (w1 && w2) {
61+
return true;
62+
}
5863
}
64+
// If no low-rank pair is found, then it is not low-rank
5965
return false;
6066
}
6167

6268
static std::vector<Dense> make_linear_layers(const models::Model& model,
6369
const std::string& scope,
6470
bool self_attention,
6571
bool _is_low_rank) {
66-
dim_t num_linear_layers;
67-
if (!_is_low_rank) {
68-
num_linear_layers = self_attention ? 2 : 3;
69-
} else {
70-
num_linear_layers = 4;
71-
}
72-
72+
const dim_t num_linear_layers = !_is_low_rank ? (self_attention ? 2 : 3) : 4;
7373
std::vector<Dense> layers;
7474
layers.reserve(num_linear_layers);
7575
for (dim_t i = 0; i < num_linear_layers; ++i)

0 commit comments

Comments
 (0)