From 93b322890ebecf85ed784a64137c7c050f88771e Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 6 Aug 2025 09:50:32 -0400 Subject: [PATCH 1/4] arch : add T5Gemma encoder-decoder architecture support (#14940) --- convert_hf_to_gguf.py | 328 +++++++++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 53 ++++++ gguf-py/gguf/tensor_mapping.py | 44 +++++ src/llama-arch.cpp | 46 +++++ src/llama-arch.h | 7 + src/llama-context.cpp | 4 +- src/llama-model.cpp | 94 ++++++++++ src/llama-model.h | 7 + 8 files changed, 581 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 82b005e84a8..6d5f9a7263e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6441,6 +6441,334 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("T5GemmaForConditionalGeneration") +class T5GemmaModel(TextModel): + model_arch = gguf.MODEL_ARCH.T5GEMMA + + def __init__(self, *args, **kwargs): + # Don't call super().__init__() because it tries to find standard layer count parameters + # that don't exist in T5Gemma models (they have encoder.num_hidden_layers instead) + + # Initialize basic attributes manually + self.dir_model = args[0] if args else kwargs.get('dir_model') + self.ftype = args[1] if len(args) > 1 else kwargs.get('ftype') + self.fname_out = args[2] if len(args) > 2 else kwargs.get('fname_out') + self.is_big_endian = kwargs.get('is_big_endian', False) + self.endianess = gguf.GGUFEndian.BIG if self.is_big_endian else gguf.GGUFEndian.LITTLE + self.use_temp_file = kwargs.get('use_temp_file', False) + self.lazy = not kwargs.get('eager', False) + self.remote_hf_model_id = kwargs.get('remote_hf_model_id') + self.metadata_override = kwargs.get('metadata_override') + self.model_name = kwargs.get('model_name') + self.dir_model_card = self.dir_model + + # Load model parts + if self.remote_hf_model_id is not None: + self.is_safetensors = True + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + logger.info(f"Using remote model with HuggingFace id: {self.remote_hf_model_id}") + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(self.remote_hf_model_id) + self.tensor_names = set(name for name in remote_tensors.keys()) + for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(self.remote_hf_model_id).items(): + yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) + self.get_tensors = get_remote_tensors + else: + self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") + self.is_safetensors = len(self.part_names) > 0 + if not self.is_safetensors: + self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + + # Load hyperparameters + self.hparams = kwargs.get('hparams') or ModelBase.load_hparams(self.dir_model) + self.tensor_names = None + + # Apply heuristics to figure out typical tensor encoding + if self.ftype == gguf.LlamaFileType.GUESSED: + _, first_tensor = next(self.get_tensors()) + if first_tensor.dtype == torch.float16: + logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + else: + logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") + self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + + # Configure GGUF Writer + self.gguf_writer = gguf.GGUFWriter( + path=None, + arch=gguf.MODEL_ARCH_NAMES[self.model_arch], + 
endianess=self.endianess, + use_temp_file=self.use_temp_file, + split_max_tensors=kwargs.get('split_max_tensors', 0), + split_max_size=kwargs.get('split_max_size', 0), + dry_run=kwargs.get('dry_run', False), + small_first_shard=kwargs.get('small_first_shard', False) + ) + + # T5Gemma specific initialization + self.is_encoder_decoder = True + + # Dynamically get encoder and decoder configurations + encoder_config = self.hparams.get("encoder", {}) + decoder_config = self.hparams.get("decoder", {}) + + # Dynamically set encoder and decoder layer counts + self.encoder_block_count = encoder_config.get("num_hidden_layers", 0) + self.decoder_block_count = decoder_config.get("num_hidden_layers", 0) + + # Set block_count to encoder_block_count for tensor mapping + self.block_count = self.encoder_block_count + + # Initialize tensor mapping using encoder layer count + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.encoder_block_count) + + def set_vocab(self): + # T5Gemma uses BPE tokenizer - read directly from tokenizer.json + import json + + tokenizer_json_path = self.dir_model / "tokenizer.json" + if not tokenizer_json_path.exists(): + logger.warning("tokenizer.json not found, falling back to GPT2 method") + self._set_vocab_gpt2() + return + + try: + with open(tokenizer_json_path, 'r', encoding='utf-8') as f: + tokenizer_data = json.load(f) + + # Extract vocabulary from tokenizer.json + vocab = tokenizer_data.get("model", {}).get("vocab", {}) + vocab_size = self.hparams.get("vocab_size", len(vocab)) + + # Create tokens and types lists + tokens = [] + toktypes = [] + + # Create reverse mapping from id to token + id_to_token = {v: k for k, v in vocab.items()} + + for i in range(vocab_size): + if i in id_to_token: + token = id_to_token[i] + tokens.append(token) + # Check if it's a special token + if token in ['', '', '', '', '', '']: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + else: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + + # Extract merges from tokenizer.json if available + merges = [] + if "merges" in tokenizer_data and tokenizer_data["merges"]: + merges = tokenizer_data["merges"] + logger.info(f"Found {len(merges)} merges in tokenizer.json") + elif "model" in tokenizer_data and "merges" in tokenizer_data["model"]: + merges = tokenizer_data["model"]["merges"] + logger.info(f"Found {len(merges)} merges in tokenizer.json model section") + else: + logger.warning("No merges found in tokenizer.json") + + # Convert merges to the format expected by GGUF + if merges: + # merges are in format [["token1", "token2"], ...] + # GGUF expects them as ["token1 token2", ...] 
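+ # e.g. the pair ["a", "b"] becomes the single string "a b"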
+ gguf_merges = [] + for merge in merges: + if len(merge) == 2: + gguf_merges.append(f"{merge[0]} {merge[1]}") + merges = gguf_merges + + # Add to GGUF + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + if merges: + self.gguf_writer.add_token_merges(merges) + + # Add special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + + logger.info(f"Successfully loaded T5Gemma vocabulary with {len(tokens)} tokens") + + except Exception as e: + logger.warning(f"Failed to load T5Gemma tokenizer directly: {e}") + self._set_vocab_gpt2() + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + + # Dynamically set special tokens from config instead of hardcoding + if "eos_token_id" in self.hparams: + eos_token_ids = self.hparams["eos_token_id"] + if isinstance(eos_token_ids, list) and len(eos_token_ids) > 1: + # If multiple end tokens, use the second one as end_of_turn + special_vocab._set_special_token("end_of_turn", eos_token_ids[1]) + elif isinstance(eos_token_ids, list) and len(eos_token_ids) == 1: + # If only one end token, use it as end_of_turn + special_vocab._set_special_token("end_of_turn", eos_token_ids[0]) + + # Dynamically set start_of_turn, usually end_of_turn - 1 + if "eos_token_id" in self.hparams: + eos_token_ids = self.hparams["eos_token_id"] + if isinstance(eos_token_ids, list) and len(eos_token_ids) > 1: + # Use end_of_turn - 1 as start_of_turn + start_of_turn_id = eos_token_ids[1] - 1 + special_vocab._set_special_token("start_of_turn", start_of_turn_id) + elif isinstance(eos_token_ids, list) and len(eos_token_ids) == 1: + # Use end_of_turn - 1 as start_of_turn + start_of_turn_id = eos_token_ids[0] - 1 + special_vocab._set_special_token("start_of_turn", start_of_turn_id) + + special_vocab.add_to_gguf(self.gguf_writer) + + if "pad_token_id" in self.hparams: + self.gguf_writer.add_pad_token_id(self.hparams["pad_token_id"]) + + # Dynamically set special token IDs + if "pad_token_id" in self.hparams: + self.gguf_writer.add_pad_token_id(self.hparams["pad_token_id"]) + + # Dynamically set multiple end tokens + if "eos_token_id" in self.hparams: + eos_token_ids = self.hparams["eos_token_id"] + if isinstance(eos_token_ids, list) and len(eos_token_ids) > 0: + self.gguf_writer.add_eos_token_id(eos_token_ids[0]) # Primary end token + elif isinstance(eos_token_ids, int): + self.gguf_writer.add_eos_token_id(eos_token_ids) + + def set_gguf_parameters(self): + # Dynamically set encoder parameters + encoder_config = self.hparams["encoder"] + + if "max_position_embeddings" in encoder_config: + self.gguf_writer.add_context_length(encoder_config["max_position_embeddings"]) + if "hidden_size" in encoder_config: + self.gguf_writer.add_embedding_length(encoder_config["hidden_size"]) + if "num_hidden_layers" in encoder_config: + self.gguf_writer.add_block_count(encoder_config["num_hidden_layers"]) + if "intermediate_size" in encoder_config: + self.gguf_writer.add_feed_forward_length(encoder_config["intermediate_size"]) + if "num_attention_heads" in encoder_config: + self.gguf_writer.add_head_count(encoder_config["num_attention_heads"]) + if "num_key_value_heads" in encoder_config: + self.gguf_writer.add_head_count_kv(encoder_config["num_key_value_heads"]) + if "head_dim" in encoder_config: + self.gguf_writer.add_key_length(encoder_config["head_dim"]) + if "rms_norm_eps" in 
encoder_config: + self.gguf_writer.add_layer_norm_rms_eps(encoder_config["rms_norm_eps"]) + if "sliding_window" in encoder_config: + self.gguf_writer.add_sliding_window(encoder_config["sliding_window"]) + if "attn_logit_softcapping" in encoder_config: + self.gguf_writer.add_attn_logit_softcapping(encoder_config["attn_logit_softcapping"]) + if "final_logit_softcapping" in encoder_config: + self.gguf_writer.add_final_logit_softcapping(encoder_config["final_logit_softcapping"]) + if "rope_theta" in encoder_config: + self.gguf_writer.add_rope_freq_base(encoder_config["rope_theta"]) + + # Dynamically set decoder parameters + decoder_config = self.hparams["decoder"] + if "cross_attention_hidden_size" in decoder_config: + self.gguf_writer.add_key_value("cross_attention_hidden_size", decoder_config["cross_attention_hidden_size"], gguf.GGUFValueType.UINT32) + + # Dynamically set global parameters + if "vocab_size" in encoder_config: + self.gguf_writer.add_vocab_size(encoder_config["vocab_size"]) + + if "dropout_rate" in self.hparams: + self.gguf_writer.add_key_value("dropout_rate", self.hparams["dropout_rate"], gguf.GGUFValueType.FLOAT32) + if "classifier_dropout_rate" in self.hparams: + self.gguf_writer.add_key_value("classifier_dropout_rate", self.hparams["classifier_dropout_rate"], gguf.GGUFValueType.FLOAT32) + + if "initializer_range" in self.hparams: + self.gguf_writer.add_key_value("initializer_range", self.hparams["initializer_range"], gguf.GGUFValueType.FLOAT32) + + if "attention_bias" in encoder_config: + self.gguf_writer.add_key_value("attention_bias", encoder_config["attention_bias"], gguf.GGUFValueType.BOOL) + if "attention_dropout" in encoder_config: + self.gguf_writer.add_key_value("attention_dropout", encoder_config["attention_dropout"], gguf.GGUFValueType.FLOAT32) + if "query_pre_attn_scalar" in encoder_config: + self.gguf_writer.add_key_value("query_pre_attn_scalar", encoder_config["query_pre_attn_scalar"], gguf.GGUFValueType.UINT32) + + # Dynamically set encoder's other parameters + for key, value in encoder_config.items(): + if key not in ["max_position_embeddings", "hidden_size", "num_hidden_layers", "intermediate_size", + "num_attention_heads", "num_key_value_heads", "head_dim", "rms_norm_eps", + "sliding_window", "attn_logit_softcapping", "final_logit_softcapping", + "rope_theta", "attention_bias", "attention_dropout", "query_pre_attn_scalar", "vocab_size"]: + if isinstance(value, bool): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.BOOL) + elif isinstance(value, int): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.UINT32) + elif isinstance(value, float): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.FLOAT32) + elif isinstance(value, str): + self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.STRING) + + # Dynamically set decoder's other parameters + for key, value in decoder_config.items(): + if key not in ["cross_attention_hidden_size"]: + if isinstance(value, bool): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.BOOL) + elif isinstance(value, int): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.UINT32) + elif isinstance(value, float): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.FLOAT32) + elif isinstance(value, str): + self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.STRING) + + # T5 models typically use 32 relative attention buckets + 
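# Each bucket maps a range of relative token distances to one learned bias entry per head; 32 matches the relative_attention_num_buckets default in HF's T5Config. +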
self.gguf_writer.add_relative_attn_buckets_count(32) + + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # T5GEMMA models contain shared token embeddings tensors saved as either "model.decoder.embed_tokens.weight" + # or "model.encoder.embed_tokens.weight". We use the decoder one as the token embeddings for both encoder + # and decoder and ignore the encoder one. + if name in ["model.decoder.embed_tokens.weight", "model.encoder.embed_tokens.weight"]: + if not hasattr(self, 'shared_token_embeddings_found'): + self.shared_token_embeddings_found = False + if not self.shared_token_embeddings_found: + name = "model.decoder.embed_tokens.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # T5GEMMA tensor names are already in the correct format for mapping + # The tensor mapping in gguf-py/gguf/tensor_mapping.py already includes + # the T5GEMMA-specific mappings, so we don't need to convert them + + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + """Generate extra tensors that are not in the model weights but are needed for T5Gemma.""" + # Generate relative attention bias tensors for each layer + # These are typically initialized as zeros and learned during training + n_head_enc = self.hparams.get("encoder_num_attention_heads", 8) + n_head_dec = self.hparams.get("decoder_num_attention_heads", 8) + n_rel_attn_bkts = self.hparams.get("relative_buckets_count", 32) + + # Generate relative attention bias for encoder layers + for i in range(self.block_count): + # Encoder relative attention bias - shape should be (n_rel_attn_bkts, n_head) + rel_bias_enc = torch.zeros(n_rel_attn_bkts, n_head_enc, dtype=torch.float16) + yield f"enc.blk.{i}.attn_rel_b.weight", rel_bias_enc + + # Decoder relative attention bias - shape should be (n_rel_attn_bkts, n_head) + rel_bias_dec = torch.zeros(n_rel_attn_bkts, n_head_dec, dtype=torch.float16) + yield f"dec.blk.{i}.attn_rel_b.weight", rel_bias_dec + + # Decoder cross attention relative bias - shape should be (n_rel_attn_bkts, n_head) + rel_bias_cross = torch.zeros(n_rel_attn_bkts, n_head_dec, dtype=torch.float16) + yield f"dec.blk.{i}.cross_attn_rel_b.weight", rel_bias_cross + + @ModelBase.register("T5EncoderModel") class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 911eea504a1..ed9b8a97e0b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -362,6 +362,7 @@ class MODEL_ARCH(IntEnum): BITNET = auto() T5 = auto() T5ENCODER = auto() + T5GEMMA = auto() # T5Gemma architecture JAIS = auto() NEMOTRON = auto() EXAONE = auto() @@ -528,6 +529,12 @@ class MODEL_TENSOR(IntEnum): DEC_FFN_DOWN = auto() DEC_FFN_UP = auto() DEC_OUTPUT_NORM = auto() + # T5GEMMA specific post layer normalization tensors + DEC_POST_SELF_ATTN_NORM = auto() + DEC_POST_CROSS_ATTN_NORM = auto() + DEC_POST_FFN_NORM = auto() + ENC_POST_SELF_ATTN_NORM = auto() + ENC_POST_FFN_NORM = auto() ENC_ATTN_NORM = auto() ENC_ATTN_Q = auto() ENC_ATTN_K = auto() @@ -693,6 +700,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.T5GEMMA: "t5gemma", # T5Gemma architecture MODEL_ARCH.JAIS: "jais", 
MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", @@ -860,6 +868,12 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + # T5GEMMA specific post layer normalization tensors + MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM: "dec.blk.{bid}.post_self_attn_norm", + MODEL_TENSOR.DEC_POST_CROSS_ATTN_NORM: "dec.blk.{bid}.post_cross_attn_norm", + MODEL_TENSOR.DEC_POST_FFN_NORM: "dec.blk.{bid}.post_ffn_norm", + MODEL_TENSOR.ENC_POST_SELF_ATTN_NORM: "enc.blk.{bid}.post_self_attn_norm", + MODEL_TENSOR.ENC_POST_FFN_NORM: "enc.blk.{bid}.post_ffn_norm", MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", @@ -2238,6 +2252,45 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.T5GEMMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.DEC_ATTN_NORM, + MODEL_TENSOR.DEC_ATTN_Q, + MODEL_TENSOR.DEC_ATTN_K, + MODEL_TENSOR.DEC_ATTN_V, + MODEL_TENSOR.DEC_ATTN_OUT, + MODEL_TENSOR.DEC_ATTN_REL_B, + MODEL_TENSOR.DEC_CROSS_ATTN_NORM, + MODEL_TENSOR.DEC_CROSS_ATTN_Q, + MODEL_TENSOR.DEC_CROSS_ATTN_K, + MODEL_TENSOR.DEC_CROSS_ATTN_V, + MODEL_TENSOR.DEC_CROSS_ATTN_OUT, + MODEL_TENSOR.DEC_CROSS_ATTN_REL_B, + MODEL_TENSOR.DEC_FFN_NORM, + MODEL_TENSOR.DEC_FFN_GATE, + MODEL_TENSOR.DEC_FFN_DOWN, + MODEL_TENSOR.DEC_FFN_UP, + MODEL_TENSOR.DEC_OUTPUT_NORM, + MODEL_TENSOR.ENC_ATTN_NORM, + MODEL_TENSOR.ENC_ATTN_Q, + MODEL_TENSOR.ENC_ATTN_K, + MODEL_TENSOR.ENC_ATTN_V, + MODEL_TENSOR.ENC_ATTN_OUT, + MODEL_TENSOR.ENC_ATTN_REL_B, + MODEL_TENSOR.ENC_FFN_NORM, + MODEL_TENSOR.ENC_FFN_GATE, + MODEL_TENSOR.ENC_FFN_DOWN, + MODEL_TENSOR.ENC_FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + # T5GEMMA specific post layer normalization tensors + MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM, + MODEL_TENSOR.DEC_POST_CROSS_ATTN_NORM, + MODEL_TENSOR.DEC_POST_FFN_NORM, + MODEL_TENSOR.ENC_POST_SELF_ATTN_NORM, + MODEL_TENSOR.ENC_POST_FFN_NORM, + ], MODEL_ARCH.JAIS: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a0f11563acb..42027ceba33 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -27,6 +27,8 @@ class TensorNameMap: "embedding.word_embeddings", # chatglm "transformer.token_embeddings", # openelm "shared", # t5 + "model.decoder.embed_tokens", # t5gemma + "model.encoder.embed_tokens", # t5gemma "rwkv.embeddings", # rwkv6 "model.embeddings", # rwkv7 "model.word_embeddings", # bailingmoe @@ -887,22 +889,27 @@ class TensorNameMap: MODEL_TENSOR.DEC_ATTN_NORM: ( "decoder.block.{bid}.layer.0.layer_norm", # t5 + "model.decoder.layers.{bid}.pre_self_attn_layernorm", # t5gemma ), MODEL_TENSOR.DEC_ATTN_Q: ( "decoder.block.{bid}.layer.0.SelfAttention.q", # t5 + "model.decoder.layers.{bid}.self_attn.q_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_K: ( "decoder.block.{bid}.layer.0.SelfAttention.k", # t5 + "model.decoder.layers.{bid}.self_attn.k_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_V: ( "decoder.block.{bid}.layer.0.SelfAttention.v", # t5 + "model.decoder.layers.{bid}.self_attn.v_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_OUT: ( "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "model.decoder.layers.{bid}.self_attn.o_proj", # t5gemma ), MODEL_TENSOR.DEC_ATTN_REL_B: ( @@ -911,22 +918,27 @@ class TensorNameMap: 
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( "decoder.block.{bid}.layer.1.layer_norm", # t5 + "model.decoder.layers.{bid}.pre_cross_attn_layernorm", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 + "model.decoder.layers.{bid}.cross_attn.q_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_K: ( "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 + "model.decoder.layers.{bid}.cross_attn.k_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_V: ( "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 + "model.decoder.layers.{bid}.cross_attn.v_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 + "model.decoder.layers.{bid}.cross_attn.o_proj", # t5gemma ), MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( @@ -935,43 +947,70 @@ class TensorNameMap: MODEL_TENSOR.DEC_FFN_NORM: ( "decoder.block.{bid}.layer.2.layer_norm", # t5 + "model.decoder.layers.{bid}.pre_feedforward_layernorm", # t5gemma ), MODEL_TENSOR.DEC_FFN_GATE: ( "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 + "model.decoder.layers.{bid}.mlp.gate_proj", # t5gemma ), MODEL_TENSOR.DEC_FFN_UP: ( "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 + "model.decoder.layers.{bid}.mlp.up_proj", # t5gemma ), MODEL_TENSOR.DEC_FFN_DOWN: ( "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5 + "model.decoder.layers.{bid}.mlp.down_proj", # t5gemma ), MODEL_TENSOR.DEC_OUTPUT_NORM: ( "decoder.final_layer_norm", # t5 + "model.decoder.norm", # t5gemma + ), + + # T5GEMMA specific post layer normalization tensors + MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM: ( + "model.decoder.layers.{bid}.post_self_attn_layernorm", # t5gemma + ), + MODEL_TENSOR.DEC_POST_CROSS_ATTN_NORM: ( + "model.decoder.layers.{bid}.post_cross_attn_layernorm", # t5gemma + ), + MODEL_TENSOR.DEC_POST_FFN_NORM: ( + "model.decoder.layers.{bid}.post_feedforward_layernorm", # t5gemma + ), + MODEL_TENSOR.ENC_POST_SELF_ATTN_NORM: ( + "model.encoder.layers.{bid}.post_self_attn_layernorm", # t5gemma + ), + MODEL_TENSOR.ENC_POST_FFN_NORM: ( + "model.encoder.layers.{bid}.post_feedforward_layernorm", # t5gemma ), MODEL_TENSOR.ENC_ATTN_NORM: ( "encoder.block.{bid}.layer.0.layer_norm", # t5 + "model.encoder.layers.{bid}.pre_self_attn_layernorm", # t5gemma ), MODEL_TENSOR.ENC_ATTN_Q: ( "encoder.block.{bid}.layer.0.SelfAttention.q", # t5 + "model.encoder.layers.{bid}.self_attn.q_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_K: ( "encoder.block.{bid}.layer.0.SelfAttention.k", # t5 + "model.encoder.layers.{bid}.self_attn.k_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_V: ( "encoder.block.{bid}.layer.0.SelfAttention.v", # t5 + "model.encoder.layers.{bid}.self_attn.v_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_OUT: ( "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "model.encoder.layers.{bid}.self_attn.o_proj", # t5gemma ), MODEL_TENSOR.ENC_ATTN_REL_B: ( @@ -980,25 +1019,30 @@ class TensorNameMap: MODEL_TENSOR.ENC_FFN_NORM: ( "encoder.block.{bid}.layer.1.layer_norm", # t5 + "model.encoder.layers.{bid}.pre_feedforward_layernorm", # t5gemma ), MODEL_TENSOR.ENC_FFN_GATE: ( "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 + "model.encoder.layers.{bid}.mlp.gate_proj", # t5gemma ), MODEL_TENSOR.ENC_FFN_UP: ( "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 + "model.encoder.layers.{bid}.mlp.up_proj", # t5gemma ), MODEL_TENSOR.ENC_FFN_DOWN: ( "encoder.block.{bid}.layer.1.DenseReluDense.wo", 
# t5 + "model.encoder.layers.{bid}.mlp.down_proj", # t5gemma ), ############################################################################ # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 + "model.encoder.norm", # t5gemma "layer_norm", # neobert ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 18dcc6ddfe5..0feb928a981 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, + { LLM_ARCH_T5GEMMA, "t5gemma" }, { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, @@ -1499,6 +1500,46 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_T5GEMMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" }, + { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" }, + { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" }, + { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" }, + { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" }, + { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" }, + { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" }, + { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" }, + { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" }, + { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" }, + { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" }, + { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" }, + { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" }, + { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" }, + { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" }, + { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" }, + { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + { LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, "dec.blk.%d.post_self_attn_norm" }, + { LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, "dec.blk.%d.post_cross_attn_norm" }, + { LLM_TENSOR_DEC_POST_FFN_NORM, "dec.blk.%d.post_ffn_norm" }, + { LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, "enc.blk.%d.post_self_attn_norm" }, + { LLM_TENSOR_ENC_POST_FFN_NORM, "enc.blk.%d.post_ffn_norm" }, + }, + }, { LLM_ARCH_JAIS, { @@ -2196,6 +2237,11 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ENC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}}, {LLM_TENSOR_ENC_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_DEC_POST_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + 
{LLM_TENSOR_ENC_POST_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 7af587e7951..4e67e2b2412 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -70,6 +70,7 @@ enum llm_arch { LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, + LLM_ARCH_T5GEMMA, LLM_ARCH_JAIS, LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, @@ -381,6 +382,12 @@ enum llm_tensor { LLM_TENSOR_DEC_FFN_DOWN, LLM_TENSOR_DEC_FFN_UP, LLM_TENSOR_DEC_OUTPUT_NORM, + // T5GEMMA specific post layer normalization tensors + LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, + LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, + LLM_TENSOR_DEC_POST_FFN_NORM, + LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, + LLM_TENSOR_ENC_POST_FFN_NORM, LLM_TENSOR_ENC_ATTN_NORM, LLM_TENSOR_ENC_ATTN_Q, LLM_TENSOR_ENC_ATTN_K, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 26a5cf9c3f8..911f24c567a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -915,7 +915,7 @@ int llama_context::encode(const llama_batch & batch_inp) { } // TODO: hacky solution - if (model.arch == LLM_ARCH_T5 && t_embd) { + if ((model.arch == LLM_ARCH_T5 || model.arch == LLM_ARCH_T5GEMMA) && t_embd) { //cross.t_embd = t_embd; synchronize(); @@ -1271,7 +1271,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { bool has_embd = cparams.embeddings; // TODO: hacky enc-dec support - if (model.arch == LLM_ARCH_T5) { + if (model.arch == LLM_ARCH_T5 || model.arch == LLM_ARCH_T5GEMMA) { has_logits = true; has_embd = true; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 58ca7df707e..425b1e2cd23 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1522,6 +1522,19 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); type = LLM_TYPE_UNKNOWN; } break; + case LLM_ARCH_T5GEMMA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + + uint32_t dec_start_token_id; + if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) { + hparams.dec_start_token_id = dec_start_token_id; + } + + // T5Gemma models have varying sizes, so we'll set type as unknown + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -4343,6 +4356,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_T5GEMMA: + { + const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0); + output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0); + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm_enc = 
create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); + + layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0); + + layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // T5GEMMA specific post layer normalization tensors for encoder + layer.post_self_attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_POST_SELF_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.post_ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_POST_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); + + layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0); + + layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0); + // this tensor seems to be unused in HF transformers implementation + layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); + + layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // T5GEMMA specific post layer normalization tensors for decoder + layer.post_self_attn_norm = create_tensor(tn(LLM_TENSOR_DEC_POST_SELF_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.post_cross_attn_norm = create_tensor(tn(LLM_TENSOR_DEC_POST_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.post_ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_POST_FFN_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_T5ENCODER: { const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; @@ -18389,6 +18466,20 @@ 
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_T5GEMMA: + { + switch (params.gtype) { + case LLM_GRAPH_TYPE_ENCODER: + llm = std::make_unique(*this, params); + break; + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + llm = std::make_unique(*this, params); + break; + default: + GGML_ABORT("invalid graph type"); + }; + } break; case LLM_ARCH_JAIS: { llm = std::make_unique(*this, params); @@ -18621,6 +18712,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: + case LLM_ARCH_T5GEMMA: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: @@ -18794,6 +18886,7 @@ bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { case LLM_ARCH_T5: return true; case LLM_ARCH_T5ENCODER: return true; + case LLM_ARCH_T5GEMMA: return true; default: return false; } } @@ -18801,6 +18894,7 @@ bool llama_model_has_encoder(const llama_model * model) { bool llama_model_has_decoder(const llama_model * model) { switch (model->arch) { case LLM_ARCH_T5ENCODER: return false; + case LLM_ARCH_T5GEMMA: return true; default: return true; } } diff --git a/src/llama-model.h b/src/llama-model.h index 6fcd74d57fd..dc9cbead891 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -243,6 +243,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_exps = nullptr; struct ggml_tensor * ffn_norm_enc = nullptr; + // T5GEMMA specific post layer normalization tensors + struct ggml_tensor * post_self_attn_norm = nullptr; + struct ggml_tensor * post_cross_attn_norm = nullptr; + struct ggml_tensor * post_ffn_norm = nullptr; + struct ggml_tensor * post_self_attn_norm_enc = nullptr; + struct ggml_tensor * post_ffn_norm_enc = nullptr; + // ff struct ggml_tensor * ffn_gate = nullptr; // w1 struct ggml_tensor * ffn_down = nullptr; // w2 From f5144c138b4a0ab7917bcfda34b2aec2ec420690 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 6 Aug 2025 09:55:38 -0400 Subject: [PATCH 2/4] fix: add type safety checks for T5Gemma model initialization --- convert_hf_to_gguf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6d5f9a7263e..1a0dafef333 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6451,8 +6451,14 @@ def __init__(self, *args, **kwargs): # Initialize basic attributes manually self.dir_model = args[0] if args else kwargs.get('dir_model') + if self.dir_model is None: + raise ValueError("dir_model is required") self.ftype = args[1] if len(args) > 1 else kwargs.get('ftype') + if self.ftype is None: + raise ValueError("ftype is required") self.fname_out = args[2] if len(args) > 2 else kwargs.get('fname_out') + if self.fname_out is None: + raise ValueError("fname_out is required") self.is_big_endian = kwargs.get('is_big_endian', False) self.endianess = gguf.GGUFEndian.BIG if self.is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = kwargs.get('use_temp_file', False) @@ -6466,6 +6472,8 @@ def __init__(self, *args, **kwargs): if self.remote_hf_model_id is not None: self.is_safetensors = True def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + if self.remote_hf_model_id is None: + raise ValueError("remote_hf_model_id is required for remote models") logger.info(f"Using remote model with HuggingFace id: {self.remote_hf_model_id}") remote_tensors = 
gguf.utility.SafetensorRemote.get_list_tensors_hf_model(self.remote_hf_model_id) self.tensor_names = set(name for name in remote_tensors.keys()) From f892c18e221af5062552f3bdf5051b26da5b3f9a Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 6 Aug 2025 18:42:38 -0400 Subject: [PATCH 3/4] arch : add T5Gemma encoder-decoder architecture support with improvements (#14940) - Add T5Gemma model support with proper encoder-decoder architecture - Use super().__init__() instead of manual initialization for better inheritance - Use format_tensor_name() for consistent tensor naming - Explicitly enumerate included keys instead of excluding keys - Add proper type annotations for better type safety - Fix all trailing whitespace issues - Support relative attention bias tensors generation - Handle T5Gemma-specific post-layer normalization tensors - Implement proper tokenizer handling for BPE tokenizer - Add comprehensive tensor mapping for all T5Gemma components --- convert_hf_to_gguf.py | 175 +++++++++++++-------------------- gguf-py/gguf/tensor_mapping.py | 2 +- 2 files changed, 68 insertions(+), 109 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1a0dafef333..3d86d9a7eae 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6446,114 +6446,61 @@ class T5GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.T5GEMMA def __init__(self, *args, **kwargs): - # Don't call super().__init__() because it tries to find standard layer count parameters - # that don't exist in T5Gemma models (they have encoder.num_hidden_layers instead) - - # Initialize basic attributes manually - self.dir_model = args[0] if args else kwargs.get('dir_model') - if self.dir_model is None: + # Load hyperparameters first to modify them for super().__init__() + dir_model: Path = args[0] if args else kwargs.get('dir_model') + if dir_model is None: raise ValueError("dir_model is required") - self.ftype = args[1] if len(args) > 1 else kwargs.get('ftype') - if self.ftype is None: - raise ValueError("ftype is required") - self.fname_out = args[2] if len(args) > 2 else kwargs.get('fname_out') - if self.fname_out is None: - raise ValueError("fname_out is required") - self.is_big_endian = kwargs.get('is_big_endian', False) - self.endianess = gguf.GGUFEndian.BIG if self.is_big_endian else gguf.GGUFEndian.LITTLE - self.use_temp_file = kwargs.get('use_temp_file', False) - self.lazy = not kwargs.get('eager', False) - self.remote_hf_model_id = kwargs.get('remote_hf_model_id') - self.metadata_override = kwargs.get('metadata_override') - self.model_name = kwargs.get('model_name') - self.dir_model_card = self.dir_model - - # Load model parts - if self.remote_hf_model_id is not None: - self.is_safetensors = True - def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: - if self.remote_hf_model_id is None: - raise ValueError("remote_hf_model_id is required for remote models") - logger.info(f"Using remote model with HuggingFace id: {self.remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(self.remote_hf_model_id) - self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(self.remote_hf_model_id).items(): - yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) - self.get_tensors = get_remote_tensors - else: - self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") - self.is_safetensors = len(self.part_names) > 0 - if not 
self.is_safetensors: - self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - - # Load hyperparameters - self.hparams = kwargs.get('hparams') or ModelBase.load_hparams(self.dir_model) - self.tensor_names = None - - # Apply heuristics to figure out typical tensor encoding - if self.ftype == gguf.LlamaFileType.GUESSED: - _, first_tensor = next(self.get_tensors()) - if first_tensor.dtype == torch.float16: - logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - else: - logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - - # Configure GGUF Writer - self.gguf_writer = gguf.GGUFWriter( - path=None, - arch=gguf.MODEL_ARCH_NAMES[self.model_arch], - endianess=self.endianess, - use_temp_file=self.use_temp_file, - split_max_tensors=kwargs.get('split_max_tensors', 0), - split_max_size=kwargs.get('split_max_size', 0), - dry_run=kwargs.get('dry_run', False), - small_first_shard=kwargs.get('small_first_shard', False) - ) - + + hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model) + encoder_config = hparams.get("encoder", {}) + # Add num_hidden_layers to hparams so super().__init__() can find it + hparams["num_hidden_layers"] = encoder_config.get("num_hidden_layers", 0) + kwargs["hparams"] = hparams + + # Now call super().__init__() with modified hparams + super().__init__(*args, **kwargs) + # T5Gemma specific initialization self.is_encoder_decoder = True - + # Dynamically get encoder and decoder configurations - encoder_config = self.hparams.get("encoder", {}) decoder_config = self.hparams.get("decoder", {}) - + # Dynamically set encoder and decoder layer counts self.encoder_block_count = encoder_config.get("num_hidden_layers", 0) self.decoder_block_count = decoder_config.get("num_hidden_layers", 0) - + # Set block_count to encoder_block_count for tensor mapping self.block_count = self.encoder_block_count - + # Initialize tensor mapping using encoder layer count self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.encoder_block_count) def set_vocab(self): # T5Gemma uses BPE tokenizer - read directly from tokenizer.json import json - + tokenizer_json_path = self.dir_model / "tokenizer.json" if not tokenizer_json_path.exists(): logger.warning("tokenizer.json not found, falling back to GPT2 method") self._set_vocab_gpt2() return - + try: with open(tokenizer_json_path, 'r', encoding='utf-8') as f: tokenizer_data = json.load(f) - + # Extract vocabulary from tokenizer.json vocab = tokenizer_data.get("model", {}).get("vocab", {}) vocab_size = self.hparams.get("vocab_size", len(vocab)) - + # Create tokens and types lists tokens = [] toktypes = [] - + # Create reverse mapping from id to token id_to_token = {v: k for k, v in vocab.items()} - + for i in range(vocab_size): if i in id_to_token: token = id_to_token[i] @@ -6566,7 +6513,7 @@ def set_vocab(self): else: tokens.append(f"[PAD{i}]") toktypes.append(gguf.TokenType.UNUSED) - + # Extract merges from tokenizer.json if available merges = [] if "merges" in tokenizer_data and tokenizer_data["merges"]: @@ -6577,7 +6524,7 @@ def set_vocab(self): logger.info(f"Found {len(merges)} merges in tokenizer.json model section") else: logger.warning("No merges found in tokenizer.json") - + # Convert merges to the format expected by GGUF if merges: # merges are in format [["token1", "token2"], ...] 
@@ -6587,7 +6534,7 @@ def set_vocab(self): if len(merge) == 2: gguf_merges.append(f"{merge[0]} {merge[1]}") merges = gguf_merges - + # Add to GGUF self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre("default") @@ -6595,19 +6542,19 @@ def set_vocab(self): self.gguf_writer.add_token_types(toktypes) if merges: self.gguf_writer.add_token_merges(merges) - + # Add special tokens special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) special_vocab.add_to_gguf(self.gguf_writer) - + logger.info(f"Successfully loaded T5Gemma vocabulary with {len(tokens)} tokens") - + except Exception as e: logger.warning(f"Failed to load T5Gemma tokenizer directly: {e}") self._set_vocab_gpt2() - + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - + # Dynamically set special tokens from config instead of hardcoding if "eos_token_id" in self.hparams: eos_token_ids = self.hparams["eos_token_id"] @@ -6617,7 +6564,7 @@ def set_vocab(self): elif isinstance(eos_token_ids, list) and len(eos_token_ids) == 1: # If only one end token, use it as end_of_turn special_vocab._set_special_token("end_of_turn", eos_token_ids[0]) - + # Dynamically set start_of_turn, usually end_of_turn - 1 if "eos_token_id" in self.hparams: eos_token_ids = self.hparams["eos_token_id"] @@ -6629,16 +6576,16 @@ def set_vocab(self): # Use end_of_turn - 1 as start_of_turn start_of_turn_id = eos_token_ids[0] - 1 special_vocab._set_special_token("start_of_turn", start_of_turn_id) - + special_vocab.add_to_gguf(self.gguf_writer) - + if "pad_token_id" in self.hparams: self.gguf_writer.add_pad_token_id(self.hparams["pad_token_id"]) # Dynamically set special token IDs if "pad_token_id" in self.hparams: self.gguf_writer.add_pad_token_id(self.hparams["pad_token_id"]) - + # Dynamically set multiple end tokens if "eos_token_id" in self.hparams: eos_token_ids = self.hparams["eos_token_id"] @@ -6650,7 +6597,7 @@ def set_vocab(self): def set_gguf_parameters(self): # Dynamically set encoder parameters encoder_config = self.hparams["encoder"] - + if "max_position_embeddings" in encoder_config: self.gguf_writer.add_context_length(encoder_config["max_position_embeddings"]) if "hidden_size" in encoder_config: @@ -6680,32 +6627,34 @@ def set_gguf_parameters(self): decoder_config = self.hparams["decoder"] if "cross_attention_hidden_size" in decoder_config: self.gguf_writer.add_key_value("cross_attention_hidden_size", decoder_config["cross_attention_hidden_size"], gguf.GGUFValueType.UINT32) - + # Dynamically set global parameters if "vocab_size" in encoder_config: self.gguf_writer.add_vocab_size(encoder_config["vocab_size"]) - + if "dropout_rate" in self.hparams: self.gguf_writer.add_key_value("dropout_rate", self.hparams["dropout_rate"], gguf.GGUFValueType.FLOAT32) if "classifier_dropout_rate" in self.hparams: self.gguf_writer.add_key_value("classifier_dropout_rate", self.hparams["classifier_dropout_rate"], gguf.GGUFValueType.FLOAT32) - + if "initializer_range" in self.hparams: self.gguf_writer.add_key_value("initializer_range", self.hparams["initializer_range"], gguf.GGUFValueType.FLOAT32) - + if "attention_bias" in encoder_config: self.gguf_writer.add_key_value("attention_bias", encoder_config["attention_bias"], gguf.GGUFValueType.BOOL) if "attention_dropout" in encoder_config: self.gguf_writer.add_key_value("attention_dropout", encoder_config["attention_dropout"], gguf.GGUFValueType.FLOAT32) if "query_pre_attn_scalar" in encoder_config: self.gguf_writer.add_key_value("query_pre_attn_scalar", 
encoder_config["query_pre_attn_scalar"], gguf.GGUFValueType.UINT32) - + # Dynamically set encoder's other parameters + # Only include specific keys that are known to be useful for T5Gemma + encoder_keys_to_include = [ + "classifier_dropout_rate", "dropout_rate", "initializer_range", + "model_type", "torch_dtype", "use_cache", "hidden_activation" + ] for key, value in encoder_config.items(): - if key not in ["max_position_embeddings", "hidden_size", "num_hidden_layers", "intermediate_size", - "num_attention_heads", "num_key_value_heads", "head_dim", "rms_norm_eps", - "sliding_window", "attn_logit_softcapping", "final_logit_softcapping", - "rope_theta", "attention_bias", "attention_dropout", "query_pre_attn_scalar", "vocab_size"]: + if key in encoder_keys_to_include: if isinstance(value, bool): self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.BOOL) elif isinstance(value, int): @@ -6714,10 +6663,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.FLOAT32) elif isinstance(value, str): self.gguf_writer.add_key_value(f"encoder_{key}", value, gguf.GGUFValueType.STRING) - + # Dynamically set decoder's other parameters + # Only include specific keys that are known to be useful for T5Gemma + decoder_keys_to_include = [ + "classifier_dropout_rate", "dropout_rate", "initializer_range", + "model_type", "torch_dtype", "use_cache", "hidden_activation", + "is_decoder", "max_position_embeddings", "hidden_size", + "intermediate_size", "num_attention_heads", "num_key_value_heads", + "head_dim", "rms_norm_eps", "sliding_window", "attn_logit_softcapping", + "final_logit_softcapping", "rope_theta", "attention_bias", + "attention_dropout", "query_pre_attn_scalar", "vocab_size" + ] for key, value in decoder_config.items(): - if key not in ["cross_attention_hidden_size"]: + if key in decoder_keys_to_include: if isinstance(value, bool): self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.BOOL) elif isinstance(value, int): @@ -6726,10 +6685,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.FLOAT32) elif isinstance(value, str): self.gguf_writer.add_key_value(f"decoder_{key}", value, gguf.GGUFValueType.STRING) - + # T5 models typically use 32 relative attention buckets self.gguf_writer.add_relative_attn_buckets_count(32) - + self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -6761,20 +6720,20 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: n_head_enc = self.hparams.get("encoder_num_attention_heads", 8) n_head_dec = self.hparams.get("decoder_num_attention_heads", 8) n_rel_attn_bkts = self.hparams.get("relative_buckets_count", 32) - + # Generate relative attention bias for encoder layers for i in range(self.block_count): # Encoder relative attention bias - shape should be (n_rel_attn_bkts, n_head) rel_bias_enc = torch.zeros(n_rel_attn_bkts, n_head_enc, dtype=torch.float16) - yield f"enc.blk.{i}.attn_rel_b.weight", rel_bias_enc - + yield self.format_tensor_name(gguf.MODEL_TENSOR.ENC_ATTN_REL_B, i), rel_bias_enc + # Decoder relative attention bias - shape should be (n_rel_attn_bkts, n_head) rel_bias_dec = torch.zeros(n_rel_attn_bkts, n_head_dec, dtype=torch.float16) - yield f"dec.blk.{i}.attn_rel_b.weight", rel_bias_dec - + yield self.format_tensor_name(gguf.MODEL_TENSOR.DEC_ATTN_REL_B, i), rel_bias_dec + # Decoder cross attention 
relative bias - shape should be (n_rel_attn_bkts, n_head) rel_bias_cross = torch.zeros(n_rel_attn_bkts, n_head_dec, dtype=torch.float16) - yield f"dec.blk.{i}.cross_attn_rel_b.weight", rel_bias_cross + yield self.format_tensor_name(gguf.MODEL_TENSOR.DEC_CROSS_ATTN_REL_B, i), rel_bias_cross @ModelBase.register("T5EncoderModel") diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 42027ceba33..5a4e4ea9583 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -970,7 +970,7 @@ class TensorNameMap: "decoder.final_layer_norm", # t5 "model.decoder.norm", # t5gemma ), - + # T5GEMMA specific post layer normalization tensors MODEL_TENSOR.DEC_POST_SELF_ATTN_NORM: ( "model.decoder.layers.{bid}.post_self_attn_layernorm", # t5gemma From 062de3a3e7b45f0f8d9126a57ec9974acfb02023 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 6 Aug 2025 18:52:11 -0400 Subject: [PATCH 4/4] fix: resolve type annotation issues in T5Gemma model initialization --- convert_hf_to_gguf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3d86d9a7eae..2a786f3fc61 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6447,9 +6447,11 @@ class T5GemmaModel(TextModel): def __init__(self, *args, **kwargs): # Load hyperparameters first to modify them for super().__init__() - dir_model: Path = args[0] if args else kwargs.get('dir_model') + dir_model = args[0] if args else kwargs.get('dir_model') if dir_model is None: raise ValueError("dir_model is required") + # Type assertion after None check + dir_model = Path(dir_model) hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model) encoder_config = hparams.get("encoder", {})