diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 82b005e84a..2a786f3fc6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6441,6 +6441,303 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("T5GemmaForConditionalGeneration") +class T5GemmaModel(TextModel): + model_arch = gguf.MODEL_ARCH.T5GEMMA + + def __init__(self, *args, **kwargs): + # Load hyperparameters first to modify them for super().__init__() + dir_model = args[0] if args else kwargs.get('dir_model') + if dir_model is None: + raise ValueError("dir_model is required") + # Type assertion after None check + dir_model = Path(dir_model) + + hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model) + encoder_config = hparams.get("encoder", {}) + # Add num_hidden_layers to hparams so super().__init__() can find it + hparams["num_hidden_layers"] = encoder_config.get("num_hidden_layers", 0) + kwargs["hparams"] = hparams + + # Now call super().__init__() with modified hparams + super().__init__(*args, **kwargs) + + # T5Gemma specific initialization + self.is_encoder_decoder = True + + # Dynamically get encoder and decoder configurations + decoder_config = self.hparams.get("decoder", {}) + + # Dynamically set encoder and decoder layer counts + self.encoder_block_count = encoder_config.get("num_hidden_layers", 0) + self.decoder_block_count = decoder_config.get("num_hidden_layers", 0) + + # Set block_count to encoder_block_count for tensor mapping + self.block_count = self.encoder_block_count + + # Initialize tensor mapping using encoder layer count + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.encoder_block_count) + + def set_vocab(self): + # T5Gemma uses BPE tokenizer - read directly from tokenizer.json + import json + + tokenizer_json_path = self.dir_model / "tokenizer.json" + if not tokenizer_json_path.exists(): + logger.warning("tokenizer.json not found, falling back to GPT2 method") + self._set_vocab_gpt2() + return + + try: + with open(tokenizer_json_path, 'r', encoding='utf-8') as f: + tokenizer_data = json.load(f) + + # Extract vocabulary from tokenizer.json + vocab = tokenizer_data.get("model", {}).get("vocab", {}) + vocab_size = self.hparams.get("vocab_size", len(vocab)) + + # Create tokens and types lists + tokens = [] + toktypes = [] + + # Create reverse mapping from id to token + id_to_token = {v: k for k, v in vocab.items()} + + for i in range(vocab_size): + if i in id_to_token: + token = id_to_token[i] + tokens.append(token) + # Check if it's a special token + if token in ['', '', '', '', '', '']: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + else: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + + # Extract merges from tokenizer.json if available + merges = [] + if "merges" in tokenizer_data and tokenizer_data["merges"]: + merges = tokenizer_data["merges"] + logger.info(f"Found {len(merges)} merges in tokenizer.json") + elif "model" in tokenizer_data and "merges" in tokenizer_data["model"]: + merges = tokenizer_data["model"]["merges"] + logger.info(f"Found {len(merges)} merges in tokenizer.json model section") + else: + logger.warning("No merges found in tokenizer.json") + + # Convert merges to the format expected by GGUF + if merges: + # merges are in format [["token1", "token2"], ...] + # GGUF expects them as ["token1 token2", ...] 
+ gguf_merges = [] + for merge in merges: + if len(merge) == 2: + gguf_merges.append(f"{merge[0]} {merge[1]}") + merges = gguf_merges + + # Add to GGUF + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + if merges: + self.gguf_writer.add_token_merges(merges) + + # Add special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + + logger.info(f"Successfully loaded T5Gemma vocabulary with {len(tokens)} tokens") + + except Exception as e: + logger.warning(f"Failed to load T5Gemma tokenizer directly: {e}") + self._set_vocab_gpt2() + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + + # Dynamically set special tokens from config instead of hardcoding + if "eos_token_id" in self.hparams: + eos_token_ids = self.hparams["eos_token_id"] + if isinstance(eos_token_ids, list) and len(eos_token_ids) > 1: + # If multiple end tokens, use the second one as end_of_turn + special_vocab._set_special_token("end_of_turn", eos_token_ids[1]) + elif isinstance(eos_token_ids, list) and len(eos_token_ids) == 1: + # If only one end token, use it as end_of_turn + special_vocab._set_special_token("end_of_turn", eos_token_ids[0]) + + # Dynamically set start_of_turn, usually end_of_turn - 1 + if "eos_token_id" in self.hparams: + eos_token_ids = self.hparams["eos_token_id"] + if isinstance(eos_token_ids, list) and len(eos_token_ids) > 1: + # Use end_of_turn - 1 as start_of_turn + start_of_turn_id = eos_token_ids[1] - 1 + special_vocab._set_special_token("start_of_turn", start_of_turn_id) + elif isinstance(eos_token_ids, list) and len(eos_token_ids) == 1: + # Use end_of_turn - 1 as start_of_turn + start_of_turn_id = eos_token_ids[0] - 1 + special_vocab._set_special_token("start_of_turn", start_of_turn_id) + + special_vocab.add_to_gguf(self.gguf_writer) + + if "pad_token_id" in self.hparams: + self.gguf_writer.add_pad_token_id(self.hparams["pad_token_id"]) + + # Dynamically set special token IDs + if "pad_token_id" in self.hparams: + self.gguf_writer.add_pad_token_id(self.hparams["pad_token_id"]) + + # Dynamically set multiple end tokens + if "eos_token_id" in self.hparams: + eos_token_ids = self.hparams["eos_token_id"] + if isinstance(eos_token_ids, list) and len(eos_token_ids) > 0: + self.gguf_writer.add_eos_token_id(eos_token_ids[0]) # Primary end token + elif isinstance(eos_token_ids, int): + self.gguf_writer.add_eos_token_id(eos_token_ids) + + def set_gguf_parameters(self): + # Dynamically set encoder parameters + encoder_config = self.hparams["encoder"] + + if "max_position_embeddings" in encoder_config: + self.gguf_writer.add_context_length(encoder_config["max_position_embeddings"]) + if "hidden_size" in encoder_config: + self.gguf_writer.add_embedding_length(encoder_config["hidden_size"]) + if "num_hidden_layers" in encoder_config: + self.gguf_writer.add_block_count(encoder_config["num_hidden_layers"]) + if "intermediate_size" in encoder_config: + self.gguf_writer.add_feed_forward_length(encoder_config["intermediate_size"]) + if "num_attention_heads" in encoder_config: + self.gguf_writer.add_head_count(encoder_config["num_attention_heads"]) + if "num_key_value_heads" in encoder_config: + self.gguf_writer.add_head_count_kv(encoder_config["num_key_value_heads"]) + if "head_dim" in encoder_config: + self.gguf_writer.add_key_length(encoder_config["head_dim"]) + if "rms_norm_eps" in 
encoder_config: + self.gguf_writer.add_layer_norm_rms_eps(encoder_config["rms_norm_eps"]) + if "sliding_window" in encoder_config: + self.gguf_writer.add_sliding_window(encoder_config["sliding_window"]) + if "attn_logit_softcapping" in encoder_config: + self.gguf_writer.add_attn_logit_softcapping(encoder_config["attn_logit_softcapping"]) + if "final_logit_softcapping" in encoder_config: + self.gguf_writer.add_final_logit_softcapping(encoder_config["final_logit_softcapping"]) + if "rope_theta" in encoder_config: + self.gguf_writer.add_rope_freq_base(encoder_config["rope_theta"]) + + # Dynamically set decoder parameters + decoder_config = self.hparams["decoder"] + if "cross_attention_hidden_size" in decoder_config: + self.gguf_writer.add_key_value("cross_attention_hidden_size", decoder_config["cross_attention_hidden_size"], gguf.GGUFValueType.UINT32) + + # Dynamically set global parameters + if "vocab_size" in encoder_config: + self.gguf_writer.add_vocab_size(encoder_config["vocab_size"]) + + if "dropout_rate" in self.hparams: + self.gguf_writer.add_key_value("dropout_rate", self.hparams["dropout_rate"], gguf.GGUFValueType.FLOAT32) + if "classifier_dropout_rate" in self.hparams: + self.gguf_writer.add_key_value("classifier_dropout_rate", self.hparams["classifier_dropout_rate"], gguf.GGUFValueType.FLOAT32) + + if "initializer_range" in self.hparams: + self.gguf_writer.add_key_value("initializer_range", self.hparams["initializer_range"], gguf.GGUFValueType.FLOAT32) + + if "attention_bias" in encoder_config: + self.gguf_writer.add_key_value("attention_bias", encoder_config["attention_bias"], gguf.GGUFValueType.BOOL) + if "attention_dropout" in encoder_config: + self.gguf_writer.add_key_value("attention_dropout", encoder_config["attention_dropout"], gguf.GGUFValueType.FLOAT32) + if "query_pre_attn_scalar" in encoder_config: + self.gguf_writer.add_key_value("query_pre_attn_scalar", encoder_config["query_pre_attn_scalar"], gguf.GGUFValueType.UINT32) + + # Dynamically set encoder's other parameters + # Only include specific keys that are known to be useful for T5Gemma + encoder_keys_to_include = [ + "classifier_dropout_rate", "dropout_rate", "initializer_range", + "model_type", "torch_dtype", "use_cache", "hidden_activation" + ] + for key, value in encoder_config.items(): + if key in encoder_keys_to_include: + if isinstance(value, bool): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.BOOL) + elif isinstance(value, int): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.UINT32) + elif isinstance(value, float): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.FLOAT32) + elif isinstance(value, str): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.STRING) + + # Dynamically set decoder's other parameters + # Only include specific keys that are known to be useful for T5Gemma + decoder_keys_to_include = [ + "classifier_dropout_rate", "dropout_rate", "initializer_range", + "model_type", "torch_dtype", "use_cache", "hidden_activation", + "is_decoder", "max_position_embeddings", "hidden_size", + "intermediate_size", "num_attention_heads", "num_key_value_heads", + "head_dim", "rms_norm_eps", "sliding_window", "attn_logit_softcapping", + "final_logit_softcapping", "rope_theta", "attention_bias", + "attention_dropout", "query_pre_attn_scalar", "vocab_size" + ] + for key, value in decoder_config.items(): + if key in decoder_keys_to_include: + if isinstance(value, bool): + 
self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.BOOL) + elif isinstance(value, int): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.UINT32) + elif isinstance(value, float): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.FLOAT32) + elif isinstance(value, str): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.STRING) + + # T5 models typically use 32 relative attention buckets + self.gguf_writer.add_relative_attn_buckets_count(32) + + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # T5GEMMA models contain shared token embeddings tensors saved as either "model.decoder.embed_tokens.weight" + # or "model.encoder.embed_tokens.weight". We use the decoder one as the token embeddings for both encoder + # and decoder and ignore the encoder one. + if name in ["model.decoder.embed_tokens.weight", "model.encoder.embed_tokens.weight"]: + if not hasattr(self, 'shared_token_embeddings_found'): + self.shared_token_embeddings_found = False + if not self.shared_token_embeddings_found: + name = "model.decoder.embed_tokens.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # T5GEMMA tensor names are already in the correct format for mapping + # The tensor mapping in gguf-py/gguf/tensor_mapping.py already includes + # the T5GEMMA-specific mappings, so we don't need to convert them + + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + """Generate extra tensors that are not in the model weights but are needed for T5Gemma.""" + # Generate relative attention bias tensors for each layer + # These are typically initialized as zeros and learned during training + n_head_enc = self.hparams.get("encoder_num_attention_heads", 8) + n_head_dec = self.hparams.get("decoder_num_attention_heads", 8) + n_rel_attn_bkts = self.hparams.get("relative_buckets_count", 32) + + # Generate relative attention bias for encoder layers + for i in range(self.block_count): + # Encoder relative attention bias - shape should be (n_rel_attn_bkts, n_head) + rel_bias_enc = torch.zeros(n_rel_attn_bkts, n_head_enc, dtype=torch.float16) + yield self.format_tensor_name(gguf.MODEL_TENSOR.ENC_ATTN_REL_B, i), rel_bias_enc + + # Decoder relative attention bias - shape should be (n_rel_attn_bkts, n_head) + rel_bias_dec = torch.zeros(n_rel_attn_bkts, n_head_dec, dtype=torch.float16) + yield self.format_tensor_name(gguf.MODEL_TENSOR.DEC_ATTN_REL_B, i), rel_bias_dec + + # Decoder cross attention relative bias - shape should be (n_rel_attn_bkts, n_head) + rel_bias_cross = torch.zeros(n_rel_attn_bkts, n_head_dec, dtype=torch.float16) + yield self.format_tensor_name(gguf.MODEL_TENSOR.DEC_CROSS_ATTN_REL_B, i), rel_bias_cross + + @ModelBase.register("T5EncoderModel") class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 911eea504a..ed9b8a97e0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -362,6 +362,7 @@ class MODEL_ARCH(IntEnum): BITNET = auto() T5 = auto() T5ENCODER = auto() + T5GEMMA = auto() # T5Gemma architecture JAIS = auto() NEMOTRON = auto() EXAONE = auto() @@ -528,6 +529,12 @@ class MODEL_TENSOR(IntEnum): 
DEC_FFN_DOWN = auto() DEC_FFN_UP = auto() DEC_OUTPUT_NORM = auto() + # T5GEMMA specific post layer normalization tensors + DEC_POST_SELF_ATTN_NORM = auto() + DEC_POST_CROSS_ATTN_NORM = auto() + DEC_POST_FFN_NORM = auto() + ENC_POST_SELF_ATTN_NORM = auto() + ENC_POST_FFN_NORM = auto() ENC_ATTN_NORM = auto() ENC_ATTN_Q = auto() ENC_ATTN_K = auto() @@ -693,6 +700,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.T5GEMMA: "t5gemma", # T5Gemma architecture MODEL_ARCH.JAIS: "jais", MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", @@ -860,6 +868,12 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + # T5GEMMA specific post layer normalization tensors + MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM: "dec.blk.{bid}.post_self_attn_norm", + MODEL_TENSOR.DEC_POST_CROSS_ATTN_NORM: "dec.blk.{bid}.post_cross_attn_norm", + MODEL_TENSOR.DEC_POST_FFN_NORM: "dec.blk.{bid}.post_ffn_norm", + MODEL_TENSOR.ENC_POST_SELF_ATTN_NORM: "enc.blk.{bid}.post_self_attn_norm", + MODEL_TENSOR.ENC_POST_FFN_NORM: "enc.blk.{bid}.post_ffn_norm", MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", @@ -2238,6 +2252,45 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.T5GEMMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.DEC_ATTN_NORM, + MODEL_TENSOR.DEC_ATTN_Q, + MODEL_TENSOR.DEC_ATTN_K, + MODEL_TENSOR.DEC_ATTN_V, + MODEL_TENSOR.DEC_ATTN_OUT, + MODEL_TENSOR.DEC_ATTN_REL_B, + MODEL_TENSOR.DEC_CROSS_ATTN_NORM, + MODEL_TENSOR.DEC_CROSS_ATTN_Q, + MODEL_TENSOR.DEC_CROSS_ATTN_K, + MODEL_TENSOR.DEC_CROSS_ATTN_V, + MODEL_TENSOR.DEC_CROSS_ATTN_OUT, + MODEL_TENSOR.DEC_CROSS_ATTN_REL_B, + MODEL_TENSOR.DEC_FFN_NORM, + MODEL_TENSOR.DEC_FFN_GATE, + MODEL_TENSOR.DEC_FFN_DOWN, + MODEL_TENSOR.DEC_FFN_UP, + MODEL_TENSOR.DEC_OUTPUT_NORM, + MODEL_TENSOR.ENC_ATTN_NORM, + MODEL_TENSOR.ENC_ATTN_Q, + MODEL_TENSOR.ENC_ATTN_K, + MODEL_TENSOR.ENC_ATTN_V, + MODEL_TENSOR.ENC_ATTN_OUT, + MODEL_TENSOR.ENC_ATTN_REL_B, + MODEL_TENSOR.ENC_FFN_NORM, + MODEL_TENSOR.ENC_FFN_GATE, + MODEL_TENSOR.ENC_FFN_DOWN, + MODEL_TENSOR.ENC_FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + # T5GEMMA specific post layer normalization tensors + MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM, + MODEL_TENSOR.DEC_POST_CROSS_ATTN_NORM, + MODEL_TENSOR.DEC_POST_FFN_NORM, + MODEL_TENSOR.ENC_POST_SELF_ATTN_NORM, + MODEL_TENSOR.ENC_POST_FFN_NORM, + ], MODEL_ARCH.JAIS: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a0f11563ac..5a4e4ea958 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -27,6 +27,8 @@ class TensorNameMap: "embedding.word_embeddings", # chatglm "transformer.token_embeddings", # openelm "shared", # t5 + "model.decoder.embed_tokens", # t5gemma + "model.encoder.embed_tokens", # t5gemma "rwkv.embeddings", # rwkv6 "model.embeddings", # rwkv7 "model.word_embeddings", # bailingmoe @@ -887,22 +889,27 @@ class TensorNameMap: MODEL_TENSOR.DEC_ATTN_NORM: ( "decoder.block.{bid}.layer.0.layer_norm", # t5 + "model.decoder.layers.{bid}.pre_self_attn_layernorm", # t5gemma ), MODEL_TENSOR.DEC_ATTN_Q: ( "decoder.block.{bid}.layer.0.SelfAttention.q", # t5 + 
"model.decoder.layers.{bid}.self_attn.q_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_K: ( "decoder.block.{bid}.layer.0.SelfAttention.k", # t5 + "model.decoder.layers.{bid}.self_attn.k_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_V: ( "decoder.block.{bid}.layer.0.SelfAttention.v", # t5 + "model.decoder.layers.{bid}.self_attn.v_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_OUT: ( "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "model.decoder.layers.{bid}.self_attn.o_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_REL_B: ( @@ -911,22 +918,27 @@ class TensorNameMap: MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( "decoder.block.{bid}.layer.1.layer_norm", # t5 + "model.decoder.layers.{bid}.pre_cross_attn_layernorm", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 + "model.decoder.layers.{bid}.cross_attn.q_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_K: ( "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 + "model.decoder.layers.{bid}.cross_attn.k_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_V: ( "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 + "model.decoder.layers.{bid}.cross_attn.v_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 + "model.decoder.layers.{bid}.cross_attn.o_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( @@ -935,43 +947,70 @@ class TensorNameMap: MODEL_TENSOR.DEC_FFN_NORM: ( "decoder.block.{bid}.layer.2.layer_norm", # t5 + "model.decoder.layers.{bid}.pre_feedforward_layernorm", # t5gemma ), MODEL_TENSOR.DEC_FFN_GATE: ( "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 + "model.decoder.layers.{bid}.mlp.gate_proj", # t5gemma ), MODEL_TENSOR.DEC_FFN_UP: ( "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 + "model.decoder.layers.{bid}.mlp.up_proj", # t5gemma ), MODEL_TENSOR.DEC_FFN_DOWN: ( "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5 + "model.decoder.layers.{bid}.mlp.down_proj", # t5gemma ), MODEL_TENSOR.DEC_OUTPUT_NORM: ( "decoder.final_layer_norm", # t5 + "model.decoder.norm", # t5gemma + ), + + # T5GEMMA specific post layer normalization tensors + MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM: ( + "model.decoder.layers.{bid}.post_self_attn_layernorm", # t5gemma + ), + MODEL_TENSOR.DEC_POST_CROSS_ATTN_NORM: ( + "model.decoder.layers.{bid}.post_cross_attn_layernorm", # t5gemma + ), + MODEL_TENSOR.DEC_POST_FFN_NORM: ( + "model.decoder.layers.{bid}.post_feedforward_layernorm", # t5gemma + ), + MODEL_TENSOR.ENC_POST_SELF_ATTN_NORM: ( + "model.encoder.layers.{bid}.post_self_attn_layernorm", # t5gemma + ), + MODEL_TENSOR.ENC_POST_FFN_NORM: ( + "model.encoder.layers.{bid}.post_feedforward_layernorm", # t5gemma ), MODEL_TENSOR.ENC_ATTN_NORM: ( "encoder.block.{bid}.layer.0.layer_norm", # t5 + "model.encoder.layers.{bid}.pre_self_attn_layernorm", # t5gemma ), MODEL_TENSOR.ENC_ATTN_Q: ( "encoder.block.{bid}.layer.0.SelfAttention.q", # t5 + "model.encoder.layers.{bid}.self_attn.q_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_K: ( "encoder.block.{bid}.layer.0.SelfAttention.k", # t5 + "model.encoder.layers.{bid}.self_attn.k_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_V: ( "encoder.block.{bid}.layer.0.SelfAttention.v", # t5 + "model.encoder.layers.{bid}.self_attn.v_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_OUT: ( "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "model.encoder.layers.{bid}.self_attn.o_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_REL_B: ( @@ -980,25 +1019,30 @@ class TensorNameMap: 
MODEL_TENSOR.ENC_FFN_NORM: ( "encoder.block.{bid}.layer.1.layer_norm", # t5 + "model.encoder.layers.{bid}.pre_feedforward_layernorm", # t5gemma ), MODEL_TENSOR.ENC_FFN_GATE: ( "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 + "model.encoder.layers.{bid}.mlp.gate_proj", # t5gemma ), MODEL_TENSOR.ENC_FFN_UP: ( "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 + "model.encoder.layers.{bid}.mlp.up_proj", # t5gemma ), MODEL_TENSOR.ENC_FFN_DOWN: ( "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 + "model.encoder.layers.{bid}.mlp.down_proj", # t5gemma ), ############################################################################ # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 + "model.encoder.norm", # t5gemma "layer_norm", # neobert ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 18dcc6ddfe..0feb928a98 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, + { LLM_ARCH_T5GEMMA, "t5gemma" }, { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, @@ -1499,6 +1500,46 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_T5GEMMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" }, + { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" }, + { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" }, + { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" }, + { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" }, + { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" }, + { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" }, + { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" }, + { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" }, + { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" }, + { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" }, + { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" }, + { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" }, + { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" }, + { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" }, + { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" }, + { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + { LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, "dec.blk.%d.post_self_attn_norm" }, + { LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, "dec.blk.%d.post_cross_attn_norm" }, + { LLM_TENSOR_DEC_POST_FFN_NORM, "dec.blk.%d.post_ffn_norm" }, + { LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, "enc.blk.%d.post_self_attn_norm" }, + { LLM_TENSOR_ENC_POST_FFN_NORM, "enc.blk.%d.post_ffn_norm" }, + }, + }, { LLM_ARCH_JAIS, { @@ -2196,6 +2237,11 @@ static const std::map LLM_TENSOR_INFOS = 
{ {LLM_TENSOR_ENC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}}, {LLM_TENSOR_ENC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_DEC_POST_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ENC_POST_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 7af587e795..4e67e2b241 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -70,6 +70,7 @@ enum llm_arch { LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, + LLM_ARCH_T5GEMMA, LLM_ARCH_JAIS, LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, @@ -381,6 +382,12 @@ enum llm_tensor { LLM_TENSOR_DEC_FFN_DOWN, LLM_TENSOR_DEC_FFN_UP, LLM_TENSOR_DEC_OUTPUT_NORM, + // T5GEMMA specific post layer normalization tensors + LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, + LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, + LLM_TENSOR_DEC_POST_FFN_NORM, + LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, + LLM_TENSOR_ENC_POST_FFN_NORM, LLM_TENSOR_ENC_ATTN_NORM, LLM_TENSOR_ENC_ATTN_Q, LLM_TENSOR_ENC_ATTN_K, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 26a5cf9c3f..911f24c567 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -915,7 +915,7 @@ int llama_context::encode(const llama_batch & batch_inp) { } // TODO: hacky solution - if (model.arch == LLM_ARCH_T5 && t_embd) { + if ((model.arch == LLM_ARCH_T5 || model.arch == LLM_ARCH_T5GEMMA) && t_embd) { //cross.t_embd = t_embd; synchronize(); @@ -1271,7 +1271,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { bool has_embd = cparams.embeddings; // TODO: hacky enc-dec support - if (model.arch == LLM_ARCH_T5) { + if (model.arch == LLM_ARCH_T5 || model.arch == LLM_ARCH_T5GEMMA) { has_logits = true; has_embd = true; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 58ca7df707..425b1e2cd2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1522,6 +1522,19 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); type = LLM_TYPE_UNKNOWN; } break; + case LLM_ARCH_T5GEMMA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + + uint32_t dec_start_token_id; + if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) { + hparams.dec_start_token_id = dec_start_token_id; + } + + // T5Gemma models have varying sizes, so we'll set type as unknown + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -4343,6 +4356,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_T5GEMMA: + { + const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm_enc = 
create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0); + output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0); + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); + + layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0); + + layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // T5GEMMA specific post layer normalization tensors for encoder + layer.post_self_attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.post_ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_POST_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); + + layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0); + + layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0); + // this tensor seems to be unused in HF transformers implementation + layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); + + layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), 
{n_embd, n_ff}, 0); + + // T5GEMMA specific post layer normalization tensors for decoder + layer.post_self_attn_norm = create_tensor(tn(LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.post_cross_attn_norm = create_tensor(tn(LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.post_ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_POST_FFN_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_T5ENCODER: { const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; @@ -18389,6 +18466,20 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_T5GEMMA: + { + switch (params.gtype) { + case LLM_GRAPH_TYPE_ENCODER: + llm = std::make_unique(*this, params); + break; + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + llm = std::make_unique(*this, params); + break; + default: + GGML_ABORT("invalid graph type"); + }; + } break; case LLM_ARCH_JAIS: { llm = std::make_unique(*this, params); @@ -18621,6 +18712,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: + case LLM_ARCH_T5GEMMA: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: @@ -18794,6 +18886,7 @@ bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { case LLM_ARCH_T5: return true; case LLM_ARCH_T5ENCODER: return true; + case LLM_ARCH_T5GEMMA: return true; default: return false; } } @@ -18801,6 +18894,7 @@ bool llama_model_has_encoder(const llama_model * model) { bool llama_model_has_decoder(const llama_model * model) { switch (model->arch) { case LLM_ARCH_T5ENCODER: return false; + case LLM_ARCH_T5GEMMA: return true; default: return true; } } diff --git a/src/llama-model.h b/src/llama-model.h index 6fcd74d57f..dc9cbead89 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -243,6 +243,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_exps = nullptr; struct ggml_tensor * ffn_norm_enc = nullptr; + // T5GEMMA specific post layer normalization tensors + struct ggml_tensor * post_self_attn_norm = nullptr; + struct ggml_tensor * post_cross_attn_norm = nullptr; + struct ggml_tensor * post_ffn_norm = nullptr; + struct ggml_tensor * post_self_attn_norm_enc = nullptr; + struct ggml_tensor * post_ffn_norm_enc = nullptr; + // ff struct ggml_tensor * ffn_gate = nullptr; // w1 struct ggml_tensor * ffn_down = nullptr; // w2
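
Quick sanity check for the new tensor mappings: the minimal sketch below assumes this patch is applied and gguf-py is importable; the layer count and tensor names are illustrative only. It resolves a few HF T5Gemma tensor names to their GGUF names through gguf.get_tensor_name_map, which is the same lookup T5GemmaModel.modify_tensors relies on via map_tensor_name.

import gguf

# Build the name map for the T5GEMMA arch; two blocks is arbitrary, for illustration only.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.T5GEMMA, 2)

for hf_name in (
    "model.encoder.layers.0.self_attn.q_proj.weight",           # expected: enc.blk.0.attn_q.weight
    "model.decoder.layers.1.post_cross_attn_layernorm.weight",  # expected: dec.blk.1.post_cross_attn_norm.weight
    "model.decoder.embed_tokens.weight",                         # expected: token_embd.weight
):
    # get_name() tries the bare name first, then the given suffixes.
    print(hf_name, "->", tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))

For an end-to-end check, a hypothetical invocation against a local checkpoint directory would be: python convert_hf_to_gguf.py /path/to/t5gemma --outfile t5gemma-f16.gguf --outtype f16 (the path and output file name here are placeholders).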