diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d21edce16b71e..f3f0cc03092fb 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1608,7 +1608,11 @@ def set_vocab(self):
                 self._set_vocab_llama_hf()
             except (FileNotFoundError, TypeError):
                 # Llama 3
-                self._set_vocab_gpt2()
+                try:
+                    self._set_vocab_gpt2()
+                except Exception:
+                    logger.warning('Could not set a tokenizer for this model; writing tokenizer model "none". This may be fine for some models, but verify it for this one.')
+                    self.gguf_writer.add_tokenizer_model("none")
 
         # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
         if self.hparams.get("vocab_size", 32000) == 32016:
@@ -1636,12 +1640,21 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
 
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "vocab_size" in hparams:
+            vocab_size = hparams["vocab_size"]
+        elif "text_vocab_size" in hparams:
+            vocab_size = hparams["text_vocab_size"]
+        else:
+            vocab_size = hparams["audio_vocab_size"]
+        self.gguf_writer.add_vocab_size(vocab_size)
 
         if "head_dim" in hparams:
             rope_dim = hparams["head_dim"]
+        elif "num_hidden_layers" in hparams:
+            rope_dim = hparams["num_hidden_layers"]
         else:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -1702,6 +1715,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         else:
             return []
 
+        if "codebook0_head" in name or "projection" in name:
+            return [(name, data_torch)]
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
@@ -2327,6 +2343,40 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_causal_attention(False)
 
 
+@Model.register("MimiModel")
+class MimiDec(Model):
+    model_arch = gguf.MODEL_ARCH.MIMI_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        logger.debug(f"Processing tensor: {name}")
+
+        # pass these through with their original names (no TensorNameMap entries for them yet)
+        if name.startswith("decoder.") \
+                or name.startswith("decoder_transformer.") \
+                or name.startswith("downsample.") \
+                or name.startswith("encoder.") \
+                or name.startswith("encoder_transformer.") \
+                or name.startswith("upsample.") \
+                or re.match(r"quantizer\..*_residual_vector_quantizer\..*", name):
+            logger.info(f"{name} -> {data_torch.shape}")
+            return [(name, data_torch)]
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_vocab_size(self.hparams["codebook_size"])
+        self.gguf_writer.add_group_norm_eps(self.hparams["norm_eps"])
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE
diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index 8909a65fd1e13..ebd55d9657b24 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -12,7 +12,7 @@
 from safetensors.torch import save_file
 
 # default
-model_path = './model.pt';
+model_path = './model.pt'
 
 # read from CLI
 if len(sys.argv) > 1:
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 4cc42e1674ccc..9090a91f98e9f 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -671,7 +671,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
 {
     LOG_INF("%s: constructing prompt ..\n", __func__);
 
-    std::vector<llama_token> prompt_inp;
+    llama_tokens prompt_inp;
 
     prompt_init(prompt_inp, vocab);
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cc48913d9789d..7def8d2f11f43 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -286,6 +286,7 @@ class MODEL_ARCH(IntEnum):
     GRANITE_MOE      = auto()
     CHAMELEON        = auto()
     WAVTOKENIZER_DEC = auto()
+    MIMI_DEC         = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -488,6 +489,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GRANITE_MOE:      "granitemoe",
     MODEL_ARCH.CHAMELEON:        "chameleon",
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
+    MODEL_ARCH.MIMI_DEC:         "mimi-dec",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1650,6 +1652,13 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.POSNET_ATTN_V,
         MODEL_TENSOR.POSNET_ATTN_OUT,
     ],
+    MODEL_ARCH.MIMI_DEC: [  # TODO: check these again
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.CONV1D,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+    ],
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 8d4a2b0320183..8d2bbe85398cf 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -29,6 +29,8 @@ class TensorNameMap:
             "shared",            # t5
             "rwkv.embeddings",   # rwkv6
             "model.embeddings",  # rwkv7
+            "text_embeddings",   # csm
+            "audio_embeddings",  # csm
         ),
 
         # Token type embeddings
@@ -66,6 +68,7 @@ class TensorNameMap:
             "output_layer",  # chatglm
             "head",          # rwkv
             "head.out",      # wavtokenizer
+            "audio_head",    # csm
         ),
 
         # Output norm
@@ -88,6 +91,7 @@ class TensorNameMap:
             "rwkv.ln_out",                # rwkv6
             "model.ln_out",               # rwkv7
             "backbone.final_layer_norm",  # wavtokenizer
+            "model.norm.scale",           # csm
         ),
 
         # Rope frequencies
@@ -129,6 +133,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm",  # openelm
             "rwkv.blocks.{bid}.ln1",               # rwkv6
             "model.layers.{bid}.ln1",              # rwkv7
+            "model.layers.{bid}.sa_norm.scale",    # csm
         ),
 
         # Attention norm 2
@@ -168,6 +173,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq",                            # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj",                  # exaone
+            "model.layers.{bid}.attn.q_proj",                             # csm
         ),
 
         # Attention key
@@ -182,6 +188,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk",                          # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj",                # exaone
+            "model.layers.{bid}.attn.k_proj",                           # csm
         ),
 
         # Attention value
@@ -195,6 +202,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv",                            # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj",                  # exaone
+            "model.layers.{bid}.attn.v_proj",                             # csm
         ),
 
         # Attention output
@@ -221,6 +229,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense",   # chatglm
             "transformer.layers.{bid}.attn.out_proj",      # openelm
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
+            "model.layers.{bid}.attn.output_proj",         # csm
         ),
 
         # Attention output norm
@@ -258,6 +267,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",     # Grok
             "encoder.layers.{bid}.post_attention_layernorm",  # chatglm
             "transformer.layers.{bid}.ffn_norm",              # openelm
+            "model.layers.{bid}.mlp_norm.scale",              # csm
         ),
 
         # Post feed-forward norm
@@ -314,6 +324,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w3",      # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",  # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",          # exaone
+            "model.layers.{bid}.mlp.w3",               # csm
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -347,6 +358,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1",    # refact
             "model.layers.{bid}.residual_mlp.w1",  # arctic
             "transformer.h.{bid}.mlp.c_fc_0",      # exaone
+            "model.layers.{bid}.mlp.w1",           # csm
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -388,6 +400,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer",      # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h",  # chatglm
             "model.layers.h.{bid}.mlp.c_proj",         # exaone
+            "model.layers.{bid}.mlp.w2",               # csm
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
diff --git a/split_hf.py b/split_hf.py
new file mode 100644
index 0000000000000..9d3280e68454b
--- /dev/null
+++ b/split_hf.py
@@ -0,0 +1,20 @@
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+safetensors_path = "my-models/csm/model.safetensors"
+
+# Open the original SafeTensors file
+with safe_open(safetensors_path, framework="pt", device="cpu") as f:
+    tensors = {key: f.get_tensor(key) for key in f.keys()}
+
+# Split the tensors between the two sub-models; rename the "backbone."/"decoder." prefix to "model." for the converter
+backbone_tensors = {k.replace("backbone.", "model."): v for k, v in tensors.items()
+                    if any(x in k for x in ["backbone.", "text_"])}
+decoder_tensors = {k.replace("decoder.", "model."): v for k, v in tensors.items()
+                   if any(x in k for x in ["decoder.", "audio_", "projection.", "codebook0_head."])}
+
+save_file(backbone_tensors, "backbone.safetensors")
+print(f"Saved backbone model with {len(backbone_tensors)} tensors.")
+
+save_file(decoder_tensors, "decoder.safetensors")
+print(f"Saved decoder model with {len(decoder_tensors)} tensors.")
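
As a quick sanity check after running split_hf.py, a small sketch like the one below (assuming the two output files sit in the current directory) prints a few tensor names and shapes from each split, which makes it easy to eyeball them against the names handled in convert_hf_to_gguf.py and tensor_mapping.py:

from safetensors import safe_open

# Print a few tensor names/shapes from each split file so they can be
# checked against the names the converter and TensorNameMap expect.
for path in ("backbone.safetensors", "decoder.safetensors"):
    with safe_open(path, framework="pt", device="cpu") as f:
        keys = sorted(f.keys())
        print(f"{path}: {len(keys)} tensors")
        for key in keys[:5]:  # a handful of entries is enough to spot-check
            print(f"  {key} -> {tuple(f.get_tensor(key).shape)}")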