diff --git a/.gitignore b/.gitignore
index 9c6801c43..bd54a91c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.pyc
 /.vs
+.vscode
 /build
 CMake*.json
diff --git a/include/ctranslate2/layers/attention_layer.h b/include/ctranslate2/layers/attention_layer.h
index e55ecc5de..941a04913 100644
--- a/include/ctranslate2/layers/attention_layer.h
+++ b/include/ctranslate2/layers/attention_layer.h
@@ -52,6 +52,7 @@ namespace ctranslate2 {
                            const bool multi_query = false);

    protected:
+      bool _is_low_rank;
      const bool _tensor_parallel;
      const dim_t _num_heads;
      const bool _self_attention;
diff --git a/include/ctranslate2/layers/common.h b/include/ctranslate2/layers/common.h
index 137b926d3..b6d8a29bd 100644
--- a/include/ctranslate2/layers/common.h
+++ b/include/ctranslate2/layers/common.h
@@ -135,7 +135,9 @@ namespace ctranslate2 {
      void select_weights(const StorageView* index, const StorageView* extra_bias = nullptr);
    private:
      bool _packed_weight;
+      bool _is_low_rank;
      const StorageView& _weight;
+      const StorageView* _weight2;
      const StorageView* _bias;
      const StorageView* _qscale;
      const StorageView* _qzero;
diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py
index 2684dd2c7..9c7095d7b 100644
--- a/python/ctranslate2/converters/transformers.py
+++ b/python/ctranslate2/converters/transformers.py
@@ -3,6 +3,7 @@
 import gc
 import itertools
 import os
+import re
 
 from typing import List, Optional
 
@@ -96,6 +97,13 @@ def __init__(
            trust_remote_code: Allow converting models using custom code.
        """
        self._model_name_or_path = model_name_or_path
+        self._model_processor_name = model_name_or_path
+        if model_name_or_path.startswith('efficient-speech/lite-whisper'):
+            # If this is a lite-whisper model, use OpenAI's
+            # corresponding preprocessor.
+            regex = r'whisper-[a-z0-9-]+?(?=-(?:fast|acc)|$)'
+            regex_result = re.search(regex, model_name_or_path)
+            self._model_processor_name = f"openai/{regex_result.group()}"
        self._activation_scales = activation_scales
        self._copy_files = copy_files
        self._load_as_float16 = load_as_float16
@@ -119,7 +127,6 @@ def _load(self):
                    % (config_name, ", ".join(sorted(_MODEL_LOADERS.keys())))
                )
 
-            model_class = getattr(transformers, loader.architecture_name)
            tokenizer_class = transformers.AutoTokenizer
 
            kwargs = {
@@ -137,14 +144,21 @@ def _load(self):
            if self._trust_remote_code:
                kwargs["trust_remote_code"] = self._trust_remote_code
 
-            model = self.load_model(model_class, self._model_name_or_path, **kwargs)
+            if hasattr(transformers, loader.architecture_name):
+                model_class = getattr(transformers, loader.architecture_name)
+                model = self.load_model(model_class, self._model_name_or_path, **kwargs)
+            elif self._model_name_or_path.startswith('efficient-speech/lite-whisper'):
+                model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs)
+            else:
+                raise ValueError(
+                    "The model %s is not supported by the converter."
" % self._model_name_or_path) tokenizer_kwargs = {} if self._trust_remote_code: tokenizer_kwargs["trust_remote_code"] = self._trust_remote_code tokenizer = self.load_tokenizer( - tokenizer_class, self._model_name_or_path, **tokenizer_kwargs + tokenizer_class, self._model_processor_name, **tokenizer_kwargs ) spec = loader(model, tokenizer) @@ -237,6 +251,19 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): spec.weight = spec.weight.transpose(0, 1) if module.bias is not None: spec.bias = module.bias + + def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): + if quant_type == common_spec.Quantization.CT2: + spec.low_rank_weight_1 = module.weight1 + spec.low_rank_weight_2 = module.weight2 + else: + spec.low_rank_weight_1 = module.qweight1 + spec.low_rank_weight_2 = module.qweight2 + spec.weight_scale = module.scales + spec.weight_zero = module.qzeros + + if module.bias is not None: + spec.bias = module.bias def set_embeddings(self, spec, module): spec.weight = module.weight @@ -996,6 +1023,71 @@ def set_conv1d(self, spec, module): spec.weight = module.weight spec.bias = module.bias +@register_loader("LiteWhisperConfig") +class LiteWhisperLoader(WhisperLoader): + @property + def architecture_name(self): + return "LiteWhisperForConditionalGeneration" + + def get_model_spec(self, model): + spec = whisper_spec.WhisperSpec( + model.config.encoder_layers, + model.config.encoder_attention_heads, + model.config.decoder_layers, + model.config.decoder_attention_heads, + low_rank=True, + ) + + self.set_encoder(spec.encoder, model.model.encoder) + self.set_decoder(spec.decoder, model.model.decoder) + self.set_linear(spec.decoder.projection, model.proj_out) + + return spec + + def set_encoder(self, spec, encoder): + self.set_conv1d(spec.conv1, encoder.conv1) + self.set_conv1d(spec.conv2, encoder.conv2) + + self.set_common_layers(spec, encoder) + + for layer_spec, layer in zip(spec.layer, encoder.layers): + self.set_low_rank_attention( + layer_spec.self_attention, + layer.self_attn, + ) + self.set_layer_norm( + layer_spec.self_attention.layer_norm, + layer.self_attn_layer_norm, + ) + + # Double check if these are low rank or not because of potential + # fall backs to full precision. 
+            if hasattr(layer.fc1, 'weight1'):
+                self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1)
+            else:
+                layer_spec.ffn.linear_0 = common_spec.LinearSpec()
+                self.set_linear(layer_spec.ffn.linear_0, layer.fc1)
+
+            if hasattr(layer.fc2, 'weight1'):
+                self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2)
+            else:
+                layer_spec.ffn.linear_1 = common_spec.LinearSpec()
+                self.set_linear(layer_spec.ffn.linear_1, layer.fc2)
+
+            self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm)
+
+    def set_low_rank_or_linear_router(self, spec, module, i):
+        if hasattr(module, "weight1"):
+            self.set_low_rank_linear(spec.linear[i], module)
+        else:
+            spec.linear[i] = common_spec.LinearSpec()
+            self.set_linear(spec.linear[i], module)
+
+    def set_low_rank_attention(self, spec, attention):
+        self.set_low_rank_or_linear_router(spec, attention.q_proj, 0)
+        self.set_low_rank_or_linear_router(spec, attention.k_proj, 1)
+        self.set_low_rank_or_linear_router(spec, attention.v_proj, 2)
+        self.set_low_rank_or_linear_router(spec, attention.out_proj, 3)
+
 
 @register_loader("Wav2Vec2Config")
 class Wav2Vec2Loader(BartLoader):
diff --git a/python/ctranslate2/specs/attention_spec.py b/python/ctranslate2/specs/attention_spec.py
index f49d41121..1e90c2246 100644
--- a/python/ctranslate2/specs/attention_spec.py
+++ b/python/ctranslate2/specs/attention_spec.py
@@ -32,13 +32,17 @@ def __init__(
        num_heads_kv=None,
        head_dim=None,
        sliding_window=None,
+        low_rank=False,
    ):
        self.queries_scale = model_spec.OPTIONAL
 
        self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
-        self.linear = [
-            common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
-        ]
+        if low_rank:
+            self.linear = [common_spec.LowRankLinearSpec() for _ in range(4)]
+        else:
+            self.linear = [
+                common_spec.LinearSpec() for _ in range(2 if self_attention else 3)
+            ]
 
        if relative_position:
            self.relative_position_keys = None
diff --git a/python/ctranslate2/specs/common_spec.py b/python/ctranslate2/specs/common_spec.py
index 598a452d6..4233d5fd7 100644
--- a/python/ctranslate2/specs/common_spec.py
+++ b/python/ctranslate2/specs/common_spec.py
@@ -51,6 +51,18 @@ def __init__(self):
    def has_bias(self):
        return not isinstance(self.bias, str)
 
+
+class LowRankLinearSpec(model_spec.LayerSpec):
+    def __init__(self):
+        super().__init__()
+        self.low_rank_weight_1 = None
+        self.low_rank_weight_2 = None
+        self.weight_scale = model_spec.OPTIONAL
+        self.weight_zero = model_spec.OPTIONAL
+        self.bias = model_spec.OPTIONAL
+
+    def has_bias(self):
+        return not isinstance(self.bias, str)
 
 class Conv1DSpec(model_spec.LayerSpec):
    def __init__(self):
diff --git a/python/ctranslate2/specs/transformer_spec.py b/python/ctranslate2/specs/transformer_spec.py
index 230e62cfd..4be7e9466 100644
--- a/python/ctranslate2/specs/transformer_spec.py
+++ b/python/ctranslate2/specs/transformer_spec.py
@@ -253,6 +253,7 @@ def __init__(
        rms_norm=False,
        num_heads_kv=None,
        sliding_window=None,
+        low_rank=False,
    ):
        self.self_attention = attention_spec.MultiHeadAttentionSpec(
            self_attention=True,
@@ -261,8 +262,9 @@ def __init__(
            rms_norm=rms_norm,
            num_heads_kv=num_heads_kv,
            sliding_window=sliding_window,
+            low_rank=low_rank,
        )
-        self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)
+        self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm, low_rank=low_rank)
 
 
 class TransformerDecoderLayerSpec(model_spec.LayerSpec):
@@ -340,10 +342,10 @@ def __init__(
 
 
 class FeedForwardSpec(model_spec.LayerSpec):
-    def __init__(self, glu=False, rms_norm=False):
+    def __init__(self, glu=False, rms_norm=False, low_rank=False):
        self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
-        self.linear_0 = common_spec.LinearSpec()
-        self.linear_1 = common_spec.LinearSpec()
+        self.linear_0 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec()
+        self.linear_1 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec()
        if glu:
            self.linear_0_noact = common_spec.LinearSpec()
diff --git a/python/ctranslate2/specs/whisper_spec.py b/python/ctranslate2/specs/whisper_spec.py
index e32453e1c..a04909d3e 100644
--- a/python/ctranslate2/specs/whisper_spec.py
+++ b/python/ctranslate2/specs/whisper_spec.py
@@ -32,6 +32,7 @@ def __init__(
        num_encoder_heads,
        num_decoder_layers,
        num_decoder_heads,
+        low_rank=False,
    ):
        """Initializes the model specification.
 
@@ -42,7 +43,7 @@ def __init__(
            num_decoder_heads: The number of decoder attention heads.
        """
        super().__init__()
-        self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads)
+        self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads, low_rank)
        self.decoder = transformer_spec.TransformerDecoderSpec(
            num_decoder_layers,
            num_decoder_heads,
@@ -66,12 +67,12 @@ def get_vocabulary_size(self):
 
 
 class WhisperEncoderSpec(model_spec.LayerSpec):
-    def __init__(self, num_layers, num_heads):
+    def __init__(self, num_layers, num_heads, low_rank=False):
        self.num_heads = np.dtype("int16").type(num_heads)
        self.conv1 = common_spec.Conv1DSpec()
        self.conv2 = common_spec.Conv1DSpec()
        self.position_encodings = transformer_spec.PositionEncoderSpec()
        self.layer_norm = common_spec.LayerNormSpec()
        self.layer = [
-            transformer_spec.TransformerEncoderLayerSpec() for _ in range(num_layers)
+            transformer_spec.TransformerEncoderLayerSpec(low_rank=low_rank) for _ in range(num_layers)
        ]
diff --git a/src/layers/attention.cc b/src/layers/attention.cc
index 6ad344410..526d4742b 100644
--- a/src/layers/attention.cc
+++ b/src/layers/attention.cc
@@ -360,13 +360,22 @@ namespace ctranslate2 {
        q = &queries_proj;
      }
 
-      _linear[0](*q, fused_proj);
+      if (!_is_low_rank) {
+        _linear[0](*q, fused_proj);
+      } else {
+        // Low-rank attention does not fuse QKV.
+        _linear[0](*q, queries_proj);
+        _linear[1](*q, keys_proj);
+        _linear[2](*q, values_proj);
+      }
 
      dim_t beam_size = 1;
 
      bool prefilling = (_sliding_window > 0 && values_lengths);
 
      if (!_self_attention) {
+        if (_is_low_rank)
+          throw std::invalid_argument("MultiHeadAttention does not support low-rank attention with cross-attention");
        queries_proj = std::move(fused_proj);
 
        if (cached_keys == nullptr || cached_keys->empty()) {
@@ -401,6 +410,8 @@
 
      } else {
        if (_num_heads_kv < _num_heads) {
+          if (_is_low_rank)
+            throw std::invalid_argument("MultiHeadAttention does not support low-rank attention with multi-query or GQA");
          if (queries_padder)
            queries_padder->add_padding(fused_proj);
 
@@ -419,8 +430,14 @@
          }
 
        } else {
-          split_heads(fused_proj, 3 * _num_heads, queries_padder);
-          ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj);
+          if (!_is_low_rank) {
+            split_heads(fused_proj, 3 * _num_heads, queries_padder);
+            ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj);
+          } else {
+            split_heads(queries_proj, _num_heads, queries_padder);
+            split_heads(keys_proj, _num_heads_kv, queries_padder);
+            split_heads(values_proj, _num_heads_kv, queries_padder);
+          }
        }
 
        if (_rotary_embeddings) {
diff --git a/src/layers/attention_layer.cc b/src/layers/attention_layer.cc
index c9ae67409..38064656e 100644
--- a/src/layers/attention_layer.cc
+++ b/src/layers/attention_layer.cc
@@ -51,10 +51,25 @@ namespace ctranslate2 {
      return alibi;
    }
 
+    static bool set_low_rank(const models::Model& model, const std::string& scope) {
+      const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/linear_0/low_rank_weight_1");
+      if (low_rank_weight) {
+        return true;
+      }
+      return false;
+    }
+
    static std::vector<Dense> make_linear_layers(const models::Model& model,
                                                 const std::string& scope,
-                                                 bool self_attention) {
-      const dim_t num_linear_layers = self_attention ? 2 : 3;
+                                                 bool self_attention,
+                                                 bool is_low_rank) {
+      dim_t num_linear_layers;
+      if (!is_low_rank) {
+        num_linear_layers = self_attention ? 2 : 3;
+      } else {
+        num_linear_layers = 4;
+      }
+
      std::vector<Dense> layers;
      layers.reserve(num_linear_layers);
      for (dim_t i = 0; i < num_linear_layers; ++i)
@@ -117,11 +132,12 @@ namespace ctranslate2 {
                                   bool is_decoder,
                                   Alibi* alibi,
                                   bool is_flash_attn)
-      : _tensor_parallel(model.tensor_parallel())
+      : _is_low_rank(set_low_rank(model, scope))
+      , _tensor_parallel(model.tensor_parallel())
      , _num_heads(_tensor_parallel ? SAFE_DIVIDE(num_heads, ScopedMPISetter::getNRanks()) : num_heads)
      , _self_attention(self_attention)
      , _is_decoder(is_decoder)
-      , _linear(make_linear_layers(model, scope, self_attention))
+      , _linear(make_linear_layers(model, scope, self_attention, _is_low_rank))
      , _d_model(_tensor_parallel ? SAFE_DIVIDE(_linear.back().output_size(), ScopedMPISetter::getNRanks()) : _linear.back().output_size())
      , _d_head(model.get_attribute_with_default(scope + "/head_dim", _d_model / _num_heads))
      , _pre_norm(pre_norm)
diff --git a/src/layers/common.cc b/src/layers/common.cc
index c6d1cd0b5..9b8d7c3d9 100644
--- a/src/layers/common.cc
+++ b/src/layers/common.cc
@@ -250,6 +250,13 @@ namespace ctranslate2 {
      return _encoding.dim(1);
    }
 
+    static bool set_low_rank(const models::Model& model, const std::string& scope) {
+      const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/low_rank_weight_1");
+      if (low_rank_weight) {
+        return true;
+      }
+      return false;
+    }
 
    static const StorageView& get_linear_weight(const models::Model& model,
                                                const std::string& scope,
@@ -268,7 +275,9 @@ namespace ctranslate2 {
                 const ops::ActivationType* activation_type,
                 const bool is_layer_out)
      : _packed_weight(false)
-      , _weight(get_linear_weight(model, scope, &_packed_weight))
+      , _is_low_rank(set_low_rank(model, scope))
+      , _weight(_is_low_rank ? *model.get_variable_if_exists(scope + "/low_rank_weight_1") : get_linear_weight(model, scope, &_packed_weight))
+      , _weight2(_is_low_rank ? model.get_variable_if_exists(scope + "/low_rank_weight_2") : nullptr)
      , _bias(model.get_variable_if_exists(scope + "/bias"))
      , _qscale(model.get_variable_if_exists(scope + "/weight_scale"))
      , _qzero(model.get_variable_if_exists(scope + "/weight_zero"))
@@ -287,7 +296,7 @@ namespace ctranslate2 {
      , _gemm_op(/*alpha=*/1,
                 /*beta=*/0,
                 /*trans_a=*/false,
-                 /*trans_b=*/true,
+                 /*trans_b=*/_is_low_rank ? false : true,
                 /*a_is_packed=*/false,
                 _packed_weight,
                 _quantized_gemm ? nullptr : activation_type)
@@ -307,6 +316,12 @@ namespace ctranslate2 {
    }
 
    dim_t Dense::output_size() const {
+      if (_is_low_rank) {
+        if (_partial_weight)
+          throw std::runtime_error("Low rank dense layer does not support partial weights");
+        // The weight is transposed when low_rank.
+        return _weight2->dim(1);
+      }
      return _partial_weight ? _partial_weight.dim(0) : _weight.dim(0);
    }
@@ -338,8 +353,11 @@ namespace ctranslate2 {
 
    void Dense::operator()(const StorageView& input, StorageView& output) const {
      PROFILE("Dense");
+      if (_is_low_rank && !_partial_weight.empty())
+        throw std::runtime_error("Low rank dense layer does not support partial weights");
      const StorageView* qscale = _partial_qscale.empty() ? _qscale : &_partial_qscale;
      const StorageView* weight = _partial_weight.empty() ? &_weight : &_partial_weight;
+      const StorageView* weight2 = _is_low_rank ? _weight2 : nullptr;
      const StorageView* bias = _partial_bias.empty() ? _bias : &_partial_bias;
      const StorageView* compensation = (_partial_u8_shift_compensation.empty()
                                         ? _u8_shift_compensation
@@ -349,6 +367,8 @@
      if (affected_by_tp && ScopedMPISetter::getCurRank() != 0)
        bias = nullptr;
      if (_quantized_gemm) {
+        if (_is_low_rank)
+          throw std::runtime_error("Low rank dense layer not supported with quantized gemm");
        const auto device = input.device();
        StorageView qinput(_weight.dtype(), device);
        StorageView qinput_scale(_qscale->dtype(), device);
@@ -396,6 +416,8 @@
                     output,
                     bias);
      } else if (_qzero && _qscale) {
+        if (_is_low_rank)
+          throw std::runtime_error("Low rank dense layer not supported with quantized gemm");
        switch (_quant_method) {
        case models::QUANTIZATION_TYPE::AWQ_GEMM:
          if (input.dim(0) * input.dim(1) >= 1024) {
@@ -428,7 +450,13 @@
                                 "support only ct2 and awq quantization");
        }
      } else {
-        _gemm_op(input, *weight, output, nullptr, bias);
+        if (!_is_low_rank) {
+          _gemm_op(input, *weight, output, nullptr, bias);
+        } else {
+          StorageView intermediate_output(input.dtype(), input.device());
+          _gemm_op(input, *weight, intermediate_output, nullptr);
+          _gemm_op(intermediate_output, *weight2, output, nullptr, bias);
+        }
      }
    }
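
Usage sketch (not part of the patch): with these changes, a LiteWhisper checkpoint should be convertible through the existing TransformersConverter API. The checkpoint id and output directory below are illustrative assumptions, not names referenced by this diff.

    # Minimal sketch, assuming a LiteWhisper checkpoint with this name exists on the Hugging Face Hub.
    # The converter maps it to the matching "openai/whisper-*" processor and writes low-rank weights
    # (low_rank_weight_1 / low_rank_weight_2) into the CTranslate2 model directory.
    from ctranslate2.converters import TransformersConverter

    converter = TransformersConverter(
        "efficient-speech/lite-whisper-large-v3-turbo",  # hypothetical model id
        trust_remote_code=True,  # may be required since the model is loaded via transformers.AutoModel
    )
    converter.convert("lite-whisper-large-v3-turbo-ct2", quantization="float16", force=True)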