From 915fa37d7023ee6472e8660935b42c37d1bb3a8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?=
Date: Wed, 7 May 2025 09:13:52 +0000
Subject: [PATCH 01/14] restructure HF mappings, simplify build shards loop

---
 eole/bin/convert/HF_mappings.py | 355 +++++++++++++++++++++
 eole/bin/convert/convert_HF.py  | 533 +++++--------------------------------
 2 files changed, 432 insertions(+), 456 deletions(-)
 create mode 100644 eole/bin/convert/HF_mappings.py

diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py
new file mode 100644
index 00000000..e68bb709
--- /dev/null
+++ b/eole/bin/convert/HF_mappings.py
@@ -0,0 +1,355 @@
+from collections import defaultdict
+from eole.config.models import (
+    TransformerEncoderModelConfig,
+    TransformerModelConfig,
+    TransformerLMModelConfig,
+    VisionTransformerLMModelConfig,
+)
+from eole.config import recursive_update_dict
+
+from rich import print
+from copy import deepcopy
+
+# Default tensor key mappings, based on Llama
+BASE_KEY_MAP = {
+    "decoder_layer_prefix": "model.layers.",
+    # keys outside of encoder/decoder subsections will be centralized in the first shard
+    "tgt_emb.embeddings.weight": "model.embed_tokens.weight",
+    "decoder.layer_norm.weight": "model.norm.weight",
+    "generator.weight": "lm_head.weight",
+    "decoder": {
+        ".self_attn.linear_query.": ".self_attn.q_proj.",
+        ".self_attn.linear_keys.": ".self_attn.k_proj.",
+        ".self_attn.linear_values.": ".self_attn.v_proj.",
+        ".self_attn.final_linear.": ".self_attn.o_proj.",
+        ".mlp.gate_up_proj.": ".mlp.gate_proj.",
+        ".mlp.down_proj.": ".mlp.down_proj.",
+        ".mlp.up_proj.": ".mlp.up_proj.",
+        ".input_layernorm.": ".input_layernorm.",
+        ".post_attention_layernorm.": ".post_attention_layernorm.",
+    },
+}
+
+
+# Model-specific overrides for key mappings and configurations
+# root keys are full tensor names, looked up verbatim and stored in the first shard
+# encoder/decoder sections are modules of each encoder/decoder layer;
+# their keys are prefixes completed with each param suffix (weight, bias, ...)
+MODEL_OVERRIDES = {
+    "LlamaForCausalLM": {},  # default
+    "MistralForCausalLM": {},
+    "Qwen2ForCausalLM": {
+        "config": {
+            "add_qkvbias": True,
+            "add_final_linear_bias": False,
+        }
+    },
+    "Gemma2ForCausalLM": {
+        "decoder": {
+            ".pre_feedforward_layernorm.": ".pre_feedforward_layernorm.",
+            ".post_feedforward_layernorm.": ".post_feedforward_layernorm.",
+        },
+        "config": {
+            "share_decoder_embeddings": True,
+            "ffn_layernorm": True,
+            "embeddings": {
+                "normalize": True,
+            },
+        },
+    },
+    "MixtralForCausalLM": {
+        "decoder": {
+            ".mlp.gate.": ".block_sparse_moe.gate.",
+            **{
+                f".mlp.experts.{i}.{attr}": f".block_sparse_moe.experts.{i}.w{j}."
+                for i in range(8)
+                for j, attr in enumerate(["gate_up_proj.", "down_proj.", "up_proj."])
+            },
+            **{f".mlp.experts.{i}.layer_norm.weight": ".post_attention_layernorm.weight" for i in range(8)},
+        }
+    },
+    "PhiForCausalLM": {
+        "decoder.layer_norm.weight": "model.final_layernorm.weight",
+        "decoder.layer_norm.bias": "model.final_layernorm.bias",
+        "generator.bias": "lm_head.bias",
+        "decoder": {
+            ".self_attn.final_linear.": ".self_attn.dense.",
+            ".mlp.gate_up_proj.": ".mlp.fc1.",
+            ".mlp.down_proj.": ".mlp.fc2.",
+            ".input_layernorm.": (".input_layernorm.", ""),
+        },
+        "config": {
+            "parallel_residual": True,
+            "shared_layer_norm": True,
+            "add_qkvbias": True,
+            "add_final_linear_bias": True,
+            "add_ffnbias": True,
+        },
+    },
+    "Phi3ForCausalLM": {
+        "decoder": {
+            ".self_attn.linear_query.": (".self_attn.qkv_proj.", "[:hidden_size, :]"),
+            ".self_attn.linear_keys.": (
+                ".self_attn.qkv_proj.",
+                "[hidden_size:2*hidden_size, :]",
+            ),
+            ".self_attn.linear_values.": (".self_attn.qkv_proj.", "[-hidden_size:, :]"),
+            ".mlp.gate_up_proj.": (".mlp.gate_up_proj.", "[:transformer_ff, :]"),
+            ".mlp.up_proj.": (".mlp.gate_up_proj.", "[transformer_ff:, :]"),
+        }
+    },
+    "GPT2LMHeadModel": {
+        "decoder_layer_prefix": "h.",
+        "tgt_emb.pe.weight": "wpe.weight",
+        "decoder.layer_norm.weight": "ln_f.weight",
+        "decoder.layer_norm.bias": "ln_f.bias",
+        "decoder": {
+            ".self_attn.linear_query.": (".attn.c_attn.", ".t()[:hidden_size, ...]"),
+            ".self_attn.linear_keys.": (
+                ".attn.c_attn.",
+                ".t()[hidden_size:2*hidden_size, ...]",
+            ),
+            ".self_attn.linear_values.": (".attn.c_attn.", ".t()[-hidden_size:, ...]"),
+            ".self_attn.final_linear.": (".attn.c_proj.", ".t()"),
+            ".mlp.gate_up_proj.": (".mlp.c_fc.", ".t()"),
+            ".mlp.down_proj.": (".mlp.c_proj.", ".t()"),
+            ".input_layernorm.": ".ln_1.",
+            ".post_attention_layernorm.": ".ln_2.",
+        },
+        "config": {
+            "parallel_residual": False,
+            "shared_layer_norm": True,
+            "add_qkvbias": True,
+            "add_final_linear_bias": True,
+            "add_ffnbias": True,
+            "embeddings": {
+                "position_encoding_type": "Learned",
+                "n_positions": 1024,
+            },
+            "left_pad": False,
+        },
+    },
+    "XLMRobertaXLForMaskedLM": {
+        "encoder_layer_prefix": "roberta.encoder.layer.",
+        "encoder.layer_norm.weight": "roberta.encoder.LayerNorm.weight",
+        "encoder.layer_norm.bias": "roberta.encoder.LayerNorm.bias",
+        "src_emb.embeddings.weight": "roberta.embeddings.word_embeddings.weight",
+        "src_emb.pe.weight": "roberta.embeddings.position_embeddings.weight",
+        "encoder": {
+            ".self_attn.linear_query.": ".attention.self.query.",
+            ".self_attn.linear_keys.": ".attention.self.key.",
+            ".self_attn.linear_values.": ".attention.self.value.",
+            ".self_attn.final_linear.": ".attention.output.dense.",
+            ".mlp.gate_up_proj.": ".intermediate.dense.",
+            ".mlp.down_proj.": ".output.dense.",
+            ".input_layernorm.": ".attention.self_attn_layer_norm.",
+            ".post_attention_layernorm.": ".LayerNorm.",
+        },
+        "config": {
+            "add_qkvbias": True,
+            "add_final_linear_bias": True,
+            "add_ffnbias": True,
+            "embeddings": {
+                "position_encoding_type": "Learned",
+                "n_positions": 514,
+                "position_shift": 2,
+            },
+            "left_pad": False,
+        },
+    },
+    "LlavaForConditionalGeneration": {
+        "decoder_layer_prefix": "language_model.model.layers.",
+        "tgt_emb.embeddings.weight": "language_model.model.embed_tokens.weight",
+        "decoder.layer_norm.weight": "language_model.model.norm.weight",
+        "generator.weight": "language_model.lm_head.weight",
+        "encoder.patch_conv.weight": "vision_tower.patch_conv.weight",
+        "encoder.ln_pre.weight": "vision_tower.ln_pre.weight",
+        "encoder_layer_prefix": "vision_tower.transformer.layers.",
+        "encoder": {
+            "layers": 24,
+            ".self_attn.linear_query.": ".attention.q_proj.",
+            ".self_attn.linear_keys.": ".attention.k_proj.",
+            ".self_attn.linear_values.": ".attention.v_proj.",
+            ".self_attn.final_linear.": ".attention.o_proj.",
+            ".mlp.gate_up_proj.": ".feed_forward.gate_proj.",
+            ".mlp.down_proj.": ".feed_forward.down_proj.",
+            ".mlp.up_proj.": ".feed_forward.up_proj.",
+            ".input_layernorm.": ".attention_norm.",  # not sure about this one
+            ".post_attention_layernorm.": ".ffn_norm.",
+        },
+        # vision_adapter
+        "adapter.w_in.weight": "multi_modal_projector.linear_1.weight",
+        "adapter.w_in.bias": "multi_modal_projector.linear_1.bias",
+        "adapter.w_out.weight": "multi_modal_projector.linear_2.weight",
+        "adapter.w_out.bias": "multi_modal_projector.linear_2.bias",
+    },
+    "Mistral3ForConditionalGeneration": {
+        "decoder_layer_prefix": "language_model.model.layers.",
+        "tgt_emb.embeddings.weight": "language_model.model.embed_tokens.weight",
+        "decoder.layer_norm.weight": "language_model.model.norm.weight",
+        "generator.weight": "language_model.lm_head.weight",
+        "encoder.patch_conv.weight": "vision_tower.patch_conv.weight",
+        "encoder.ln_pre.weight": "vision_tower.ln_pre.weight",
+        # vision_tower
+        "encoder_layer_prefix": "vision_tower.transformer.layers.",
+        "encoder": {
+            "layers": 24,
+            ".self_attn.linear_query.": ".attention.q_proj.",
+            ".self_attn.linear_keys.": ".attention.k_proj.",
+            ".self_attn.linear_values.": ".attention.v_proj.",
+            ".self_attn.final_linear.": ".attention.o_proj.",
+            ".mlp.gate_up_proj.": ".feed_forward.gate_proj.",
+            ".mlp.down_proj.": ".feed_forward.down_proj.",
+            ".mlp.up_proj.": ".feed_forward.up_proj.",
+            ".input_layernorm.": ".attention_norm.",  # not sure about this one
+            ".post_attention_layernorm.": ".ffn_norm.",
+        },
+        # vision_adapter
+        "adapter.w_in.weight": "multi_modal_projector.linear_1.weight",
+        "adapter.w_out.weight": "multi_modal_projector.linear_2.weight",
+        "adapter.layernorm.weight": "multi_modal_projector.norm.weight",
+        "adapter.patch_merger.merging_layer.weight": "multi_modal_projector.patch_merger.merging_layer.weight",
+    },
+    "Gemma3ForConditionalGeneration": {
+        "decoder_layer_prefix": "language_model.model.layers.",
+        "tgt_emb.embeddings.weight": "language_model.model.embed_tokens.weight",
+        "decoder.layer_norm.weight": "language_model.model.norm.weight",
+        # "generator.weight": "language_model.lm_head.weight",  # probably shared with embeddings
+        # decoder layer modules
+        "decoder": {
+            ".self_attn.q_norm.": ".self_attn.q_norm.",
+            ".self_attn.k_norm.": ".self_attn.k_norm.",
+            ".pre_feedforward_layernorm.": ".pre_feedforward_layernorm.",
+            ".post_feedforward_layernorm.": ".post_feedforward_layernorm.",
+        },
+        "encoder_layer_prefix": "vision_tower.vision_model.encoder.layers.",
+        "encoder.patch_conv.weight": "vision_tower.vision_model.embeddings.patch_embedding.weight",
+        "encoder.patch_conv.bias": "vision_tower.vision_model.embeddings.patch_embedding.bias",
+        "encoder.post_layernorm.weight": "vision_tower.vision_model.post_layernorm.weight",
+        "encoder.post_layernorm.bias": "vision_tower.vision_model.post_layernorm.bias",
+        "encoder.position_embeddings.weight": "vision_tower.vision_model.embeddings.position_embedding.weight",
+        # "encoder.ln_pre.weight": "vision_tower.ln_pre.weight",  # no ln_pre in Gemma3
+        # encoder layers modules
+        "encoder": {
+            ".self_attn.linear_query.": ".self_attn.q_proj.",
+            ".self_attn.linear_keys.": ".self_attn.k_proj.",
+            ".self_attn.linear_values.": ".self_attn.v_proj.",
+            ".self_attn.final_linear.": ".self_attn.out_proj.",
+            ".mlp.gate_up_proj.": ".mlp.fc1.",
+            ".mlp.down_proj.": ".mlp.fc2.",
+            ".input_layernorm.": ".layer_norm1.",
+            ".post_attention_layernorm.": ".layer_norm2.",
+        },
+        "adapter.w_in.weight": ("multi_modal_projector.mm_input_projection_weight", ".t()"),
+        "adapter.norm.weight": "multi_modal_projector.mm_soft_emb_norm.weight",
+        "config": {
+            "share_decoder_embeddings": True,
+            "ffn_layernorm": True,
+            "embeddings": {
+                "normalize": True,
+            },
+        },
+    },
+    "M2M100ForConditionalGeneration": {
+        "decoder_layer_prefix": "model.decoder.layers.",
+        "src_emb.embeddings.weight": "model.encoder.embed_tokens.weight",
+        "tgt_emb.embeddings.weight": "model.decoder.embed_tokens.weight",
+        "decoder.layer_norm.weight": "model.decoder.layer_norm.weight",
+        "decoder.layer_norm.bias": "model.decoder.layer_norm.bias",
+        "decoder": {
+            ".self_attn.linear_query.": ".self_attn.q_proj.",
+            ".self_attn.linear_keys.": ".self_attn.k_proj.",
+            ".self_attn.linear_values.": ".self_attn.v_proj.",
+            ".self_attn.final_linear.": ".self_attn.out_proj.",
+            ".precontext_layernorm.": ".encoder_attn_layer_norm.",
+            ".context_attn.linear_query.": ".encoder_attn.q_proj.",
+            ".context_attn.linear_keys.": ".encoder_attn.k_proj.",
+            ".context_attn.linear_values.": ".encoder_attn.v_proj.",
+            ".context_attn.final_linear.": ".encoder_attn.out_proj.",
+            ".mlp.gate_up_proj.": ".fc1.",
+            ".mlp.down_proj.": ".fc2.",
+            ".input_layernorm.": ".self_attn_layer_norm.",
+            ".post_attention_layernorm.": ".final_layer_norm.",
+        },
+        "encoder_layer_prefix": "model.encoder.layers.",
+        "encoder.layer_norm.weight": "model.encoder.layer_norm.weight",
+        "encoder.layer_norm.bias": "model.encoder.layer_norm.bias",
+        "encoder": {
+            ".self_attn.linear_query.": ".self_attn.q_proj.",
+            ".self_attn.linear_keys.": ".self_attn.k_proj.",
+            ".self_attn.linear_values.": ".self_attn.v_proj.",
+            ".self_attn.final_linear.": ".self_attn.out_proj.",
+            ".mlp.gate_up_proj.": ".fc1.",
+            ".mlp.down_proj.": ".fc2.",
+            ".input_layernorm.": ".self_attn_layer_norm.",
+            ".post_attention_layernorm.": ".final_layer_norm.",
+        },
+        "config": {
+            "parallel_residual": False,
+            "add_qkvbias": True,
+            "add_final_linear_bias": True,
+            "add_ffnbias": True,
+            "embeddings": {
+                "position_encoding_type": "SinusoidalConcat",
+                "n_positions": 1024,
+            },
+            "left_pad": False,
+            "share_decoder_embeddings": True,
+        },
+    },
+}
+
+# Combine base mappings with overrides
+# KEY_MAPS = {model: {**BASE_KEY_MAP, **overrides} for model, overrides in MODEL_OVERRIDES.items()}
+KEY_MAPS = {
+    model: recursive_update_dict(deepcopy(BASE_KEY_MAP), overrides, {}) for model, overrides in MODEL_OVERRIDES.items()
+}
+
+# Layer norm type
+LN_TABLE = defaultdict(
+    lambda: "rms",
+    {
+        "PhiForCausalLM": "standard",
+        "GPT2LMHeadModel": "standard",
+        "XLMRobertaXLForMaskedLM": "standard",
+        "Gemma2ForCausalLM": "gemma-rms",
+        "M2M100ForConditionalGeneration": "standard",
+        "Gemma3ForConditionalGeneration": "gemma-rms",
+    },
+)
+
+# Activation type (gated-silu also enables the ffn gate)
+ACT_TABLE = defaultdict(
+    lambda: "gated-silu",
+    {
+        "PhiForCausalLM": "gelu",
+        "GPT2LMHeadModel": "gelu",
+        "XLMRobertaXLForMaskedLM": "gelu",
+        "Gemma2ForCausalLM": "gated-gelu",
+        "Gemma3ForConditionalGeneration": "gated-gelu-tanh",
+        "M2M100ForConditionalGeneration": "relu",
+    },
+)
+
+# Not used anymore since Gemma3 PR, not sure if needed
+# VISION_ACT_TABLE = defaultdict(
+#     lambda: "gated-silu",
+#     {
+#         "Mistral3ForConditionalGeneration": "gated-gelu",
+#     },
+# )
+
+# Eole config class
+ARCH_TABLE = defaultdict(
+    lambda: TransformerLMModelConfig,
+    {
+        "XLMRobertaXLForMaskedLM": TransformerEncoderModelConfig,
+        "LlavaForConditionalGeneration": VisionTransformerLMModelConfig,
+        "Mistral3ForConditionalGeneration": VisionTransformerLMModelConfig,
+        "Gemma3ForConditionalGeneration": VisionTransformerLMModelConfig,
+        "M2M100ForConditionalGeneration": TransformerModelConfig,
+    },
+)
+
+# Default tokenization transform
+TOK_TABLE = defaultdict(lambda: "huggingface_tokenize")
diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py
index 8e2285b6..82ccb15a 100755
--- a/eole/bin/convert/convert_HF.py
+++ b/eole/bin/convert/convert_HF.py
@@ -33,273 +33,14 @@ from eole.constants import DefaultTokens, TORCH_DTYPES, PositionEncodingType from eole.inputters.inputter import vocabs_to_dict - -# Default tensor key mappings, based on Llama -BASE_KEY_MAP = { - "decoder_layer_prefix": "model.layers.", - "tgt_emb.embeddings.weight": "model.embed_tokens.weight", - "decoder.layer_norm.weight": "model.norm.weight", - "generator.weight": "lm_head.weight", - ".self_attn.linear_query.": ".self_attn.q_proj.", - ".self_attn.linear_keys.": ".self_attn.k_proj.", - ".self_attn.linear_values.": ".self_attn.v_proj.", - ".self_attn.final_linear.": ".self_attn.o_proj.", - ".mlp.gate_up_proj.": ".mlp.gate_proj.", - ".mlp.down_proj.": ".mlp.down_proj.", - ".mlp.up_proj.": ".mlp.up_proj.", - ".input_layernorm.weight": ".input_layernorm.weight", - ".post_attention_layernorm.weight": ".post_attention_layernorm.weight", -} - - -MODEL_OVERRIDES = { - "LlamaForCausalLM": {}, # default - "MistralForCausalLM": {}, - "Qwen2ForCausalLM": {}, - "Gemma2ForCausalLM": { - ".pre_feedforward_layernorm.weight": ".pre_feedforward_layernorm.weight", - ".post_feedforward_layernorm.weight": ".post_feedforward_layernorm.weight", - }, - "MixtralForCausalLM": { - ".mlp.gate.weight": ".block_sparse_moe.gate.weight", - **{ - f".mlp.experts.{i}.{attr}": f".block_sparse_moe.experts.{i}.w{j}." 
- for i in range(8) - for j, attr in enumerate(["gate_up_proj.", "down_proj.", "up_proj."]) - }, - **{f".mlp.experts.{i}.layer_norm.weight": ".post_attention_layernorm.weight" for i in range(8)}, - }, - "PhiForCausalLM": { - "decoder.layer_norm.weight": "model.final_layernorm.weight", - "decoder.layer_norm.bias": "model.final_layernorm.bias", - "generator.bias": "lm_head.bias", - ".self_attn.final_linear.": ".self_attn.dense.", - ".mlp.gate_up_proj.": ".mlp.fc1.", - ".mlp.down_proj.": ".mlp.fc2.", - ".input_layernorm.bias": (".input_layernorm.bias", ""), - }, - "Phi3ForCausalLM": { - ".self_attn.linear_query.": (".self_attn.qkv_proj.", "[:hidden_size, :]"), - ".self_attn.linear_keys.": ( - ".self_attn.qkv_proj.", - "[hidden_size:2*hidden_size, :]", - ), - ".self_attn.linear_values.": (".self_attn.qkv_proj.", "[-hidden_size:, :]"), - ".mlp.gate_up_proj.": (".mlp.gate_up_proj.", "[:transformer_ff, :]"), - ".mlp.up_proj.": (".mlp.gate_up_proj.", "[transformer_ff:, :]"), - }, - "GPT2LMHeadModel": { - "decoder_layer_prefix": "h.", - "tgt_emb.pe.weight": "wpe.weight", - ".self_attn.linear_query.": (".attn.c_attn.", ".t()[:hidden_size, ...]"), - ".self_attn.linear_keys.": ( - ".attn.c_attn.", - ".t()[hidden_size:2*hidden_size, ...]", - ), - ".self_attn.linear_values.": (".attn.c_attn.", ".t()[-hidden_size:, ...]"), - ".self_attn.final_linear.": (".attn.c_proj.", ".t()"), - ".mlp.gate_up_proj.": (".mlp.c_fc.", ".t()"), - ".mlp.down_proj.": (".mlp.c_proj.", ".t()"), - ".input_layernorm.weight": ".ln_1.weight", - ".input_layernorm.bias": ".ln_1.bias", - ".post_attention_layernorm.weight": ".ln_2.weight", - ".post_attention_layernorm.bias": ".ln_2.bias", - "decoder.layer_norm.weight": "ln_f.weight", - "decoder.layer_norm.bias": "ln_f.bias", - }, - "XLMRobertaXLForMaskedLM": { - "encoder_layer_prefix": "roberta.encoder.layer.", - "src_emb.embeddings.weight": "roberta.embeddings.word_embeddings.weight", - "src_emb.pe.weight": "roberta.embeddings.position_embeddings.weight", - 
".self_attn.linear_query.": ".attention.self.query.", - ".self_attn.linear_keys.": ".attention.self.key.", - ".self_attn.linear_values.": ".attention.self.value.", - ".self_attn.final_linear.": ".attention.output.dense.", - ".mlp.gate_up_proj.": ".intermediate.dense.", - ".mlp.down_proj.": ".output.dense.", - ".input_layernorm.weight": ".attention.self_attn_layer_norm.weight", - ".input_layernorm.bias": ".attention.self_attn_layer_norm.bias", - ".post_attention_layernorm.weight": ".LayerNorm.weight", - ".post_attention_layernorm.bias": ".LayerNorm.bias", - "encoder.layer_norm.weight": "roberta.encoder.LayerNorm.weight", - "encoder.layer_norm.bias": "roberta.encoder.LayerNorm.bias", - }, - "LlavaForConditionalGeneration": { - "decoder_layer_prefix": "language_model.model.layers.", - "tgt_emb.embeddings.weight": "language_model.model.embed_tokens.weight", - "decoder.layer_norm.weight": "language_model.model.norm.weight", - "generator.weight": "language_model.lm_head.weight", - "encoder.patch_conv.weight": "vision_tower.patch_conv.weight", - "encoder.ln_pre.weight": "vision_tower.ln_pre.weight", - # vision_tower - "encoder_layer_prefix": "vision_tower.transformer.layers.", - "encoder": { - "layers": 24, - ".self_attn.linear_query.": ".attention.q_proj.", - ".self_attn.linear_keys.": ".attention.k_proj.", - ".self_attn.linear_values.": ".attention.v_proj.", - ".self_attn.final_linear.": ".attention.o_proj.", - ".mlp.gate_up_proj.": ".feed_forward.gate_proj.", - ".mlp.down_proj.": ".feed_forward.down_proj.", - ".mlp.up_proj.": ".feed_forward.up_proj.", - ".input_layernorm.weight": ".attention_norm.weight", # not sure about this one - ".post_attention_layernorm.weight": ".ffn_norm.weight", - }, - # vision_adapter - "adapter.w_in.weight": "multi_modal_projector.linear_1.weight", - "adapter.w_in.bias": "multi_modal_projector.linear_1.bias", - "adapter.w_out.weight": "multi_modal_projector.linear_2.weight", - "adapter.w_out.bias": "multi_modal_projector.linear_2.bias", - }, 
- "Mistral3ForConditionalGeneration": { - "decoder_layer_prefix": "language_model.model.layers.", - "tgt_emb.embeddings.weight": "language_model.model.embed_tokens.weight", - "decoder.layer_norm.weight": "language_model.model.norm.weight", - "generator.weight": "language_model.lm_head.weight", - "encoder.patch_conv.weight": "vision_tower.patch_conv.weight", - "encoder.ln_pre.weight": "vision_tower.ln_pre.weight", - # vision_tower - "encoder_layer_prefix": "vision_tower.transformer.layers.", - "encoder": { - "layers": 24, - ".self_attn.linear_query.": ".attention.q_proj.", - ".self_attn.linear_keys.": ".attention.k_proj.", - ".self_attn.linear_values.": ".attention.v_proj.", - ".self_attn.final_linear.": ".attention.o_proj.", - ".mlp.gate_up_proj.": ".feed_forward.gate_proj.", - ".mlp.down_proj.": ".feed_forward.down_proj.", - ".mlp.up_proj.": ".feed_forward.up_proj.", - ".input_layernorm.weight": ".attention_norm.weight", # not sure about this one - ".post_attention_layernorm.weight": ".ffn_norm.weight", - }, - # vision_adapter - "adapter.w_in.weight": "multi_modal_projector.linear_1.weight", - "adapter.w_out.weight": "multi_modal_projector.linear_2.weight", - "adapter.layernorm.weight": "multi_modal_projector.norm.weight", - "adapter.patch_merger.merging_layer.weight": "multi_modal_projector.patch_merger.merging_layer.weight", - }, - "Gemma3ForConditionalGeneration": { - "decoder_layer_prefix": "language_model.model.layers.", - "tgt_emb.embeddings.weight": "language_model.model.embed_tokens.weight", - "decoder.layer_norm.weight": "language_model.model.norm.weight", - # "generator.weight": "language_model.lm_head.weight", # probably shared with embeddings - "encoder.patch_conv.weight": "vision_tower.vision_model.embeddings.patch_embedding.weight", - "encoder.patch_conv.bias": "vision_tower.vision_model.embeddings.patch_embedding.bias", - "encoder.post_layernorm.weight": "vision_tower.vision_model.post_layernorm.weight", - "encoder.post_layernorm.bias": 
"vision_tower.vision_model.post_layernorm.bias", - "encoder.position_embeddings.weight": "vision_tower.vision_model.embeddings.position_embedding.weight", - # "encoder.ln_pre.weight": "vision_tower.ln_pre.weight", # no ln_pre in Gemma3 - "encoder_layer_prefix": "vision_tower.vision_model.encoder.layers.", - ".self_attn.q_norm.": ".self_attn.q_norm.", - ".self_attn.k_norm.": ".self_attn.k_norm.", - ".pre_feedforward_layernorm.weight": ".pre_feedforward_layernorm.weight", - ".post_feedforward_layernorm.weight": ".post_feedforward_layernorm.weight", - "encoder": { - ".self_attn.linear_query.": ".self_attn.q_proj.", - ".self_attn.linear_keys.": ".self_attn.k_proj.", - ".self_attn.linear_values.": ".self_attn.v_proj.", - ".self_attn.final_linear.": ".self_attn.out_proj.", - ".mlp.gate_up_proj.": ".mlp.fc1.", - ".mlp.down_proj.": ".mlp.fc2.", - ".input_layernorm.weight": ".layer_norm1.weight", - ".input_layernorm.bias": ".layer_norm1.bias", - ".post_attention_layernorm.weight": ".layer_norm2.weight", - ".post_attention_layernorm.bias": ".layer_norm2.bias", - }, - # TODO: not the same adapter as llava - "adapter.w_in.weight": ("multi_modal_projector.mm_input_projection_weight", ".t()"), - "adapter.norm.weight": "multi_modal_projector.mm_soft_emb_norm.weight", - }, - "M2M100ForConditionalGeneration": { - "encoder_layer_prefix": "model.encoder.layers.", - "decoder_layer_prefix": "model.decoder.layers.", - "src_emb.embeddings.weight": "model.encoder.embed_tokens.weight", - "tgt_emb.embeddings.weight": "model.decoder.embed_tokens.weight", - "decoder.layer_norm.weight": "model.decoder.layer_norm.weight", - "decoder.layer_norm.bias": "model.decoder.layer_norm.bias", - "encoder.layer_norm.weight": "model.encoder.layer_norm.weight", - "encoder.layer_norm.bias": "model.encoder.layer_norm.bias", - ".self_attn.linear_query.": ".self_attn.q_proj.", - ".self_attn.linear_keys.": ".self_attn.k_proj.", - ".self_attn.linear_values.": ".self_attn.v_proj.", - ".self_attn.final_linear.": 
".self_attn.out_proj.", - ".precontext_layernorm.weight": ".encoder_attn_layer_norm.weight", - ".precontext_layernorm.bias": ".encoder_attn_layer_norm.bias", - ".context_attn.linear_query.": ".encoder_attn.q_proj.", - ".context_attn.linear_keys.": ".encoder_attn.k_proj.", - ".context_attn.linear_values.": ".encoder_attn.v_proj.", - ".context_attn.final_linear.": ".encoder_attn.out_proj.", - ".mlp.gate_up_proj.": ".fc1.", - ".mlp.down_proj.": ".fc2.", - ".input_layernorm.weight": ".self_attn_layer_norm.weight", - ".input_layernorm.bias": ".self_attn_layer_norm.bias", - ".post_attention_layernorm.weight": ".final_layer_norm.weight", - ".post_attention_layernorm.bias": ".final_layer_norm.bias", - "encoder": { - ".self_attn.linear_query.": ".self_attn.q_proj.", - ".self_attn.linear_keys.": ".self_attn.k_proj.", - ".self_attn.linear_values.": ".self_attn.v_proj.", - ".self_attn.final_linear.": ".self_attn.out_proj.", - ".mlp.gate_up_proj.": ".fc1.", - ".mlp.down_proj.": ".fc2.", - ".input_layernorm.weight": ".self_attn_layer_norm.weight", - ".input_layernorm.bias": ".self_attn_layer_norm.bias", - ".post_attention_layernorm.weight": ".final_layer_norm.weight", - ".post_attention_layernorm.bias": ".final_layer_norm.bias", - }, - }, -} - -# Combine base mappings with overrides -KEY_MAPS = {model: {**BASE_KEY_MAP, **overrides} for model, overrides in MODEL_OVERRIDES.items()} - -# Layer norm type -LN_TABLE = defaultdict( - lambda: "rms", - { - "PhiForCausalLM": "standard", - "GPT2LMHeadModel": "standard", - "XLMRobertaXLForMaskedLM": "standard", - "Gemma2ForCausalLM": "gemma-rms", - "M2M100ForConditionalGeneration": "standard", - "Gemma3ForConditionalGeneration": "gemma-rms", - }, +from eole.bin.convert.HF_mappings import ( + KEY_MAPS, + ACT_TABLE, + LN_TABLE, + ARCH_TABLE, + TOK_TABLE, ) -# Activation type (gated-silu also enables the ffn gate) -ACT_TABLE = defaultdict( - lambda: "gated-silu", - { - "PhiForCausalLM": "gelu", - "GPT2LMHeadModel": "gelu", - 
"XLMRobertaXLForMaskedLM": "gelu", - "Gemma2ForCausalLM": "gated-gelu", - "Gemma3ForConditionalGeneration": "gated-gelu-tanh", - "M2M100ForConditionalGeneration": "relu", - }, -) -VISION_ACT_TABLE = defaultdict( - lambda: "gated-silu", - { - "Mistral3ForConditionalGeneration": "gated-gelu", - }, -) - -# Eole config class -ARCH_TABLE = defaultdict( - lambda: TransformerLMModelConfig, - { - "XLMRobertaXLForMaskedLM": TransformerEncoderModelConfig, - "LlavaForConditionalGeneration": VisionTransformerLMModelConfig, - "Mistral3ForConditionalGeneration": VisionTransformerLMModelConfig, - "Gemma3ForConditionalGeneration": VisionTransformerLMModelConfig, - "M2M100ForConditionalGeneration": TransformerModelConfig, - }, -) - -# Default tokenization transform -TOK_TABLE = defaultdict(lambda: "huggingface_tokenize") - def get_sentencepiece_vocab(model_path): """Get the vocabulary from a SentencePiece model. @@ -747,7 +488,7 @@ def build_config_dict(hf): "linear_keys", "final_linear", ] - params = ["qweight", "qzeros", "scales"] + params = ["qweight", "qzeros", "scales"] + ["weight", "bias"] # some params are not quantized else: training_config["quant_type"] = "" training_config["w_bit"] = 0 @@ -765,73 +506,9 @@ def build_config_dict(hf): } ) - # Define architecture-specific configurations - arch_configs = { - "PhiForCausalLM": { - "parallel_residual": True, - "shared_layer_norm": True, - "add_qkvbias": True, - "add_final_linear_bias": True, - "add_ffnbias": True, - }, - "GPT2LMHeadModel": { - "parallel_residual": False, - "shared_layer_norm": True, - "add_qkvbias": True, - "add_final_linear_bias": True, - "add_ffnbias": True, - "embeddings": { - "position_encoding_type": "Learned", - "n_positions": 1024, - }, - "left_pad": False, - }, - "XLMRobertaXLForMaskedLM": { - "add_qkvbias": True, - "add_final_linear_bias": True, - "add_ffnbias": True, - "embeddings": { - "position_encoding_type": "Learned", - "n_positions": 514, - "position_shift": 2, - }, - "left_pad": False, - }, - 
"Qwen2ForCausalLM": { - "add_qkvbias": True, - "add_final_linear_bias": False, - }, - "Gemma2ForCausalLM": { - "share_decoder_embeddings": True, - "ffn_layernorm": True, - "embeddings": { - "normalize": True, - }, - }, - "Gemma3ForConditionalGeneration": { - "share_decoder_embeddings": True, - "ffn_layernorm": True, - "embeddings": { - "normalize": True, - }, - }, - "M2M100ForConditionalGeneration": { - "parallel_residual": False, - "add_qkvbias": True, - "add_final_linear_bias": True, - "add_ffnbias": True, - "embeddings": { - "position_encoding_type": "SinusoidalConcat", - "n_positions": 1024, - }, - "left_pad": False, - "share_decoder_embeddings": True, - }, - } - # Update model_config based on architecture - if arch in arch_configs: - for key, value in arch_configs[arch].items(): + if arch in KEY_MAPS: + for key, value in KEY_MAPS[arch].get("config", {}).items(): if isinstance(value, dict): # Update nested dictionaries model_config[key] = {**model_config.get(key, {}), **value} @@ -937,7 +614,7 @@ def get_shards_map(model_config, hf, nshards): weightmap = hf.wmap["weight_map"] # Initialize a list of checkpoint lists, one for each shard - ckpt_lists = [set() for _ in range(nshards)] + shard_checkpoints = [set() for _ in range(nshards)] # Check if a layer key belongs to the current shard def is_layer_in_range(key, prefix, layer_range): @@ -949,9 +626,9 @@ def is_layer_in_range(key, prefix, layer_range): if is_layer_in_range(key, hf.decoder_layer_prefix, layer_range) or is_layer_in_range( key, hf.encoder_layer_prefix, layer_range ): - ckpt_lists[shard].add(ckpt) + shard_checkpoints[shard].add(ckpt) - return ckpt_lists, shard_layer_ranges + return shard_checkpoints, shard_layer_ranges def build_shards(model_config, hf, args, params): @@ -968,154 +645,98 @@ def build_shards(model_config, hf, args, params): Layer parameters are distributed across shards based on the sharding configuration. 
The first shard contains embeddings and model-level parameters on top of its layer split. """ - ckpt_lists, shard_layer_ranges = get_shards_map(model_config, hf, args.nshards) + shard_checkpoints, shard_layer_ranges = get_shards_map(model_config, hf, args.nshards) + # build N shards for shard in range(args.nshards): print("starting output shard: %d/%d" % (shard + 1, args.nshards)) eole_safetensor = {} - first_shard_targets = [ - "tgt_emb.embeddings.weight", - "tgt_emb.pe.weight", - "decoder.layer_norm.weight", - "decoder.layer_norm.bias", - "src_emb.embeddings.weight", - "src_emb.pe.weight", - "encoder.layer_norm.weight", - "encoder.layer_norm.bias", - "generator.weight", - "generator.bias", - "encoder.patch_conv.weight", - "encoder.patch_conv.bias", - "encoder.ln_pre.weight", - "adapter.w_in.weight", - "adapter.w_in.bias", - "adapter.w_out.weight", - "adapter.w_out.bias", - "adapter.layernorm.weight", - "adapter.patch_merger.merging_layer.weight", - "adapter.norm.weight", - "encoder.position_embeddings.weight", - "encoder.post_layernorm.weight", - "encoder.post_layernorm.bias", - ] def build_first_shard(hf, eole_safetensor): - if model_config["share_decoder_embeddings"]: - first_shard_targets.remove("generator.weight") - for target in first_shard_targets: - if target in KEY_MAPS[hf.arch].keys(): - source = KEY_MAPS[hf.arch][target] - srckey, srcmap = source if isinstance(source, tuple) else (source, None) + for target in KEY_MAPS[hf.arch].keys(): + if model_config["share_decoder_embeddings"] and target == "generator.weight": + continue + source = KEY_MAPS[hf.arch][target] + srckey, srcmap = source if isinstance(source, tuple) else (source, None) + if isinstance(srckey, str): if hf.wmap_path: - checkpoint = hf.get_load_ckpt( - hf.base_dir, - hf.wmap["weight_map"][srckey], - ) + if srckey in hf.wmap["weight_map"]: + checkpoint = hf.get_load_ckpt( + hf.base_dir, + hf.wmap["weight_map"][srckey], + ) + else: + checkpoint = None else: checkpoint = 
hf.get_load_ckpt(*os.path.split(hf.model_path)) - w = get_weight(checkpoint, srckey) - if w is not None: - if srcmap is not None: - w = eval( - "w" + srcmap, - { - "w": w, - "hidden_size": model_config["hidden_size"], - "transformer_ff": model_config["transformer_ff"], - }, - ).contiguous() - eole_safetensor[target] = w - - if target == "generator.bias": - model_config["generator_bias"] = True - if target == "adapter.w_in.bias": - model_config["adapter_bias"] = True + else: + checkpoint = None + if checkpoint is None: + continue + w = get_weight(checkpoint, srckey) + if w is not None: + if srcmap is not None: + w = eval( + "w" + srcmap, + { + "w": w, + "hidden_size": model_config["hidden_size"], + "transformer_ff": model_config["transformer_ff"], + }, + ).contiguous() + eole_safetensor[target] = w + + # patch config depending on bias presence + if target == "generator.bias": + model_config["generator_bias"] = True + if target == "adapter.w_in.bias": + model_config["adapter_bias"] = True return eole_safetensor if shard == 0: eole_safetensor = build_first_shard(hf, eole_safetensor) - for ckpt in ckpt_lists[shard]: + # TODO: could we reverse the mapping and loop on params instead? 
(would reduce conditions) + for ckpt in shard_checkpoints[shard]: print("Loading %s" % ckpt) checkpoint = hf.checkpoint(ckpt) for i in shard_layer_ranges[shard]: prefix_mapping = ( - (hf.encoder_layer_prefix, "encoder.transformer_layers."), - (hf.decoder_layer_prefix, "decoder.transformer_layers."), + ("encoder", hf.encoder_layer_prefix, "encoder.transformer_layers."), + ("decoder", hf.decoder_layer_prefix, "decoder.transformer_layers."), ) - for hf_prefix, eole_prefix in prefix_mapping: + for section, hf_prefix, eole_prefix in prefix_mapping: if hf_prefix is None: continue - for param in params: - # TODO: factorize this better - for key_map in [KEY_MAPS[hf.arch], KEY_MAPS[hf.arch].get("encoder", {})]: - for target, source in key_map.items(): - # TODO: this should be cleaned up when rationalizing encoder/decoder mappings - if not (isinstance(source, str) or isinstance(source, tuple)): - continue - if target in first_shard_targets: - continue - srckey, srcmap = source if isinstance(source, tuple) else (source, None) - w = get_weight( - checkpoint, - hf_prefix + str(i) + srckey + param, - ) - - if w is not None: - if srcmap is not None: - w = eval( - "w" + srcmap, - { - "w": w, - "hidden_size": model_config["hidden_size"], - "transformer_ff": model_config["transformer_ff"], - }, - ).contiguous() - eole_safetensor[eole_prefix + str(i) + target + param] = w + key_map = KEY_MAPS[hf.arch].get(section, {}) + for target, source in key_map.items(): + for param in params: + # TODO: this should be cleaned up when rationalizing encoder/decoder mappings + if not (isinstance(source, str) or isinstance(source, tuple)): + continue + srckey, srcmap = source if isinstance(source, tuple) else (source, None) + w = get_weight( + checkpoint, + hf_prefix + str(i) + srckey + param, + ) + + if w is not None: + if srcmap is not None: + w = eval( + "w" + srcmap, + { + "w": w, + "hidden_size": model_config["hidden_size"], + "transformer_ff": model_config["transformer_ff"], + }, + 
).contiguous() + eole_safetensor[eole_prefix + str(i) + target + param] = w if model_config["shared_layer_norm"]: idx = 0 else: idx = 1 - # Not sure if we can handle these in the loop above - for p in ["weight", "bias"]: - for module in [ - "input_layernorm", - "layer_norm_res", - "precontext_layernorm", - "post_attention_layernorm", - "pre_feedforward_layernorm", - "post_feedforward_layernorm", - "mlp.gate", - ]: - if hf_prefix == hf.encoder_layer_prefix: - source_map = KEY_MAPS[hf.arch]["encoder"] - else: - source_map = KEY_MAPS[hf.arch] - module_p = f".{module}.{p}" - if module_p in source_map.keys(): - if isinstance(source_map[module_p], tuple): - w = get_weight( - checkpoint, - hf_prefix + str(i) + source_map[module_p][idx], - ) - else: - w = get_weight( - checkpoint, - hf_prefix + str(i) + source_map[module_p], - ) - if w is not None: - eole_safetensor[eole_prefix + str(i) + module_p] = w - - for j in range(model_config["num_experts"]): - if f".mlp.experts.{j}.layer_norm." + p in source_map.keys(): - w = get_weight( - checkpoint, - hf_prefix + str(i) + source_map[f".mlp.experts.{j}.layer_norm." + p], - ) - if w is not None: - eole_safetensor[eole_prefix + str(i) + f".mlp.experts.{j}.layer_norm." 
+ p] = w # Convert to another dtype if specified if args.dtype is not None: From 261a9e8fb169dacfa31d3caeeb17bf31d53a1c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 7 May 2025 09:40:55 +0000 Subject: [PATCH 02/14] fix flake --- eole/bin/convert/HF_mappings.py | 7 ++----- eole/bin/convert/convert_HF.py | 16 +++++----------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index e68bb709..12877d4b 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -1,4 +1,6 @@ +from copy import deepcopy from collections import defaultdict + from eole.config.models import ( TransformerEncoderModelConfig, TransformerModelConfig, @@ -7,8 +9,6 @@ ) from eole.config import recursive_update_dict -from rich import print -from copy import deepcopy # Default tensor key mappings, based on Llama BASE_KEY_MAP = { @@ -111,11 +111,8 @@ ".mlp.gate_up_proj.": (".mlp.c_fc.", ".t()"), ".mlp.down_proj.": (".mlp.c_proj.", ".t()"), ".input_layernorm.": ".ln_1.", - ".input_layernorm.": ".ln_1.", - ".post_attention_layernorm.": ".ln_2.", ".post_attention_layernorm.": ".ln_2.", "decoder.layer_norm.": "ln_f.", - "decoder.layer_norm.": "ln_f.", }, "config": { "parallel_residual": False, diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 82ccb15a..6ba5b791 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -6,7 +6,6 @@ import math import os import re -from collections import defaultdict from dataclasses import dataclass, field, fields from typing import Optional @@ -22,12 +21,6 @@ # Eole Imports from eole.bin import BaseBin, register_bin from eole.config import recursive_model_fields_set -from eole.config.models import ( - TransformerEncoderModelConfig, - TransformerModelConfig, - TransformerLMModelConfig, - VisionTransformerLMModelConfig, -) from eole.config.run import TrainConfig from 
eole.config.training import TrainingConfig from eole.constants import DefaultTokens, TORCH_DTYPES, PositionEncodingType @@ -733,10 +726,11 @@ def build_first_shard(hf, eole_safetensor): ).contiguous() eole_safetensor[eole_prefix + str(i) + target + param] = w - if model_config["shared_layer_norm"]: - idx = 0 - else: - idx = 1 + # NOTE: not sure this was really needed + # if model_config["shared_layer_norm"]: + # idx = 0 + # else: + # idx = 1 # Convert to another dtype if specified if args.dtype is not None: From 03e5a1a4065a16c18e3014d0e79ed085d5ff0255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 7 May 2025 10:10:37 +0000 Subject: [PATCH 03/14] patch phi layer_norm --- eole/bin/convert/HF_mappings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 12877d4b..4794cf5f 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -68,7 +68,7 @@ } }, "PhiForCausalLM": { - "decoder.layer_norm.": "model.final_layernorm.", + "decoder.layer_norm.weight": "model.final_layernorm.weight", "decoder.layer_norm.bias": "model.final_layernorm.bias", "generator.bias": "lm_head.bias", "decoder": { From e434e0812658f104ec205d4c95d167e643867260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 9 May 2025 07:43:41 +0000 Subject: [PATCH 04/14] fix GPT2 conversion --- eole/bin/convert/HF_mappings.py | 5 ++++- eole/bin/convert/convert_HF.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 4794cf5f..f4e3e857 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -99,7 +99,11 @@ }, "GPT2LMHeadModel": { "decoder_layer_prefix": "h.", + "decoder.layer_norm.weight": "ln_f.weight", + "decoder.layer_norm.bias": "ln_f.bias", "tgt_emb.pe.weight": "wpe.weight", + "tgt_emb.embeddings.weight": 
"wte.weight", + "generator.weight": "wte.weight", # shared with embeddings "decoder": { ".self_attn.linear_query.": (".attn.c_attn.", ".t()[:hidden_size, ...]"), ".self_attn.linear_keys.": ( @@ -112,7 +116,6 @@ ".mlp.down_proj.": (".mlp.c_proj.", ".t()"), ".input_layernorm.": ".ln_1.", ".post_attention_layernorm.": ".ln_2.", - "decoder.layer_norm.": "ln_f.", }, "config": { "parallel_residual": False, diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 6ba5b791..db3e890b 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -525,7 +525,7 @@ def get_weight(checkpoint, tensor_name): def check_tokenizer_config(hf): config = hf.config - add_bos_token = hf.tokenizer_config.get("add_bos_token", hf.tokenizer_config.get("bos_token", False) is not None) + add_bos_token = hf.tokenizer_config.get("add_bos_token", hf.tokenizer_config.get("bos_token", None) is not None) chat_template = {"chat_template": hf.tokenizer_config.get("chat_template", None)} eos_token_id = config.get("eos_token_id", None) optional_eos = [] From bf58c86f06cac171c1aaa1f8a5bae3a0b16b58ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 9 May 2025 08:03:45 +0000 Subject: [PATCH 05/14] fix NLLB conversion --- eole/bin/convert/HF_mappings.py | 3 ++- eole/config/models.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index f4e3e857..9d60a145 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -273,7 +273,8 @@ ".post_attention_layernorm.": ".final_layer_norm.", }, "encoder_layer_prefix": "model.encoder.layers.", - "encoder.layer_norm.": "model.encoder.layer_norm.", + "encoder.layer_norm.weight": "model.encoder.layer_norm.weight", + "encoder.layer_norm.bias": "model.encoder.layer_norm.bias", "encoder": { ".self_attn.linear_query.": ".self_attn.q_proj.", ".self_attn.linear_keys.": 
".self_attn.k_proj.", diff --git a/eole/config/models.py b/eole/config/models.py index ba383b03..e1a64af4 100644 --- a/eole/config/models.py +++ b/eole/config/models.py @@ -694,12 +694,16 @@ def encoder_decoder_type(cls, data: Any) -> Any: # patch to allow transparent setting of encoder/decoder_type if not (isinstance(data, dict)): return data - if "encoder" in data.keys(): + if isinstance(data.get("encoder", None), Config): data["encoder"].encoder_type = "transformer" + elif isinstance(data.get("encoder", None), dict): + data["encoder"]["encoder_type"] = "transformer" else: data["encoder"] = {"encoder_type": "transformer"} - if "decoder" in data.keys(): + if isinstance(data.get("decoder", None), Config): data["decoder"].decoder_type = "transformer" + elif isinstance(data.get("decoder", None), dict): + data["decoder"]["decoder_type"] = "transformer" else: data["decoder"] = {"decoder_type": "transformer"} return data From 816db0e9038115a6d33313a4cd80e7c67f1936c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 9 May 2025 08:59:53 +0000 Subject: [PATCH 06/14] patch XLMRoberta conversion --- eole/bin/convert/HF_mappings.py | 2 +- eole/bin/convert/convert_HF.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 9d60a145..6f291f6a 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -136,7 +136,7 @@ "encoder.layer_norm.bias": "roberta.encoder.LayerNorm.bias", "src_emb.embeddings.weight": "roberta.embeddings.word_embeddings.weight", "src_emb.pe.weight": "roberta.embeddings.position_embeddings.weight", - "decoder": { + "encoder": { ".self_attn.linear_query.": ".attention.self.query.", ".self_attn.linear_keys.": ".attention.self.key.", ".self_attn.linear_values.": ".attention.self.value.", diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index db3e890b..18c6199b 100755 --- 
a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -746,7 +746,13 @@ def build_first_shard(hf, eole_safetensor): def check_sentencepiece_tokenizer(hf): tokenizer_basename = os.path.basename(hf.tokenizer_model) if hf.tokenizer_json is not None: - vocab = list(hf.tokenizer["model"]["vocab"].keys()) + tokenizer_vocab = hf.tokenizer["model"]["vocab"] + if isinstance(tokenizer_vocab, dict): + vocab = list(hf.tokenizer["model"]["vocab"].keys()) + elif isinstance(tokenizer_vocab, list): + vocab = [token for token, freq in tokenizer_vocab] + else: + raise NotImplementedError(f"Type {type(tokenizer_vocab)} is not supported for SentencePiece vocab.") else: vocab = get_sentencepiece_vocab(hf.tokenizer_model) if hf.tokenizer_json is not None: From c4ae6787f60bfc0ef7fae6fc2cb249b1d4b98758 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 13 May 2025 08:04:40 +0000 Subject: [PATCH 07/14] make post_attention_layernorm configurable --- eole/bin/convert/HF_mappings.py | 1 + eole/config/models.py | 4 ++++ eole/decoders/transformer.py | 9 ++++++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 6f291f6a..6c5372e4 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -83,6 +83,7 @@ "add_qkvbias": True, "add_final_linear_bias": True, "add_ffnbias": True, + "post_attention_layernorm": False, }, }, "Phi3ForCausalLM": { diff --git a/eole/config/models.py b/eole/config/models.py index e1a64af4..1650db35 100644 --- a/eole/config/models.py +++ b/eole/config/models.py @@ -231,6 +231,10 @@ class TransformerConfig(Config): default=False, description="Add pre/post_feedforward_layernorm around MLP forward. 
" "Note: introduced for gemma2 support.", ) + post_attention_layernorm: bool = Field( + default=True, + description="Add post-attention layernorm around MHA forward.", + ) add_qkvbias: bool = Field( default=False, description="Add bias to nn.Linear of Query/Key/Value in MHA. " diff --git a/eole/decoders/transformer.py b/eole/decoders/transformer.py index f35c7ead..0ca6b03d 100644 --- a/eole/decoders/transformer.py +++ b/eole/decoders/transformer.py @@ -62,9 +62,12 @@ def __init__(self, decoder_config, running_config=None, with_cross_attn=False): self.residual_layernorm = LayerNorm[decoder_config.layer_norm]( decoder_config.hidden_size, eps=decoder_config.norm_eps ) - self.post_attention_layernorm = LayerNorm[decoder_config.layer_norm]( - decoder_config.hidden_size, eps=decoder_config.norm_eps - ) + if decoder_config.post_attention_layernorm: + self.post_attention_layernorm = LayerNorm[decoder_config.layer_norm]( + decoder_config.hidden_size, eps=decoder_config.norm_eps + ) + else: + self.post_attention_layernorm = nn.Identity() if decoder_config.num_experts > 0: self.mlp = MoE(decoder_config, running_config) else: From 10ad1759d8bbfb6e7bfe376be552177c83692aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 13 May 2025 08:41:24 +0000 Subject: [PATCH 08/14] patch is_seq2seq condition, mark phi-2 as validated --- eole/predict/inference.py | 2 +- recipes/model-validator/run.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/eole/predict/inference.py b/eole/predict/inference.py index 0796eed5..96476452 100644 --- a/eole/predict/inference.py +++ b/eole/predict/inference.py @@ -443,7 +443,7 @@ def _process_bucket(bucket_predictions): batch_data = self.predict_batch(batch, attn_debug) predictions = prediction_builder.from_batch(batch_data) - is_seq2seq = hasattr(self.model, "encoder") and hasattr(self.model, "decoder") + is_seq2seq = getattr(self.model, "encoder", None) is not None and getattr(self.model, "decoder", None) 
is not None if ( is_seq2seq and self._tgt_sep_idx != self._tgt_unk_idx diff --git a/recipes/model-validator/run.sh b/recipes/model-validator/run.sh index 88499ce6..e738997b 100755 --- a/recipes/model-validator/run.sh +++ b/recipes/model-validator/run.sh @@ -16,6 +16,7 @@ models=( "meta-llama/CodeLlama-7b-hf" "microsoft/Phi-3.5-mini-instruct" "microsoft/Phi-3-mini-128k-instruct" + "microsoft/phi-2" # to work on # "mistralai/Mathstral-7B-v0.1" # fp32 ! # "microsoft/Phi-3.5-MoE-instruct" # convert_HF not set for PhiMoEForCausalLM From a4035e254a800fef291a128c4fed9beb8c88c2a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 13 May 2025 08:42:25 +0000 Subject: [PATCH 09/14] black --- eole/bin/convert/HF_mappings.py | 2 +- eole/predict/inference.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 6c5372e4..c5982ef5 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -104,7 +104,7 @@ "decoder.layer_norm.bias": "ln_f.bias", "tgt_emb.pe.weight": "wpe.weight", "tgt_emb.embeddings.weight": "wte.weight", - "generator.weight": "wte.weight", # shared with embeddings + "generator.weight": "wte.weight", # shared with embeddings "decoder": { ".self_attn.linear_query.": (".attn.c_attn.", ".t()[:hidden_size, ...]"), ".self_attn.linear_keys.": ( diff --git a/eole/predict/inference.py b/eole/predict/inference.py index 96476452..23671b91 100644 --- a/eole/predict/inference.py +++ b/eole/predict/inference.py @@ -443,7 +443,9 @@ def _process_bucket(bucket_predictions): batch_data = self.predict_batch(batch, attn_debug) predictions = prediction_builder.from_batch(batch_data) - is_seq2seq = getattr(self.model, "encoder", None) is not None and getattr(self.model, "decoder", None) is not None + is_seq2seq = ( + getattr(self.model, "encoder", None) is not None and getattr(self.model, "decoder", None) is not None + ) if ( is_seq2seq and 
self._tgt_sep_idx != self._tgt_unk_idx From d8894115ff55bc570cd3bdc2b6f3000223ff97a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 14 May 2025 15:05:20 +0000 Subject: [PATCH 10/14] support Qwen3 and Qwen3MoE --- eole/bin/convert/HF_mappings.py | 32 ++++++++++++++++++++++++++ eole/bin/convert/convert_HF.py | 5 ++-- eole/config/models.py | 3 +++ eole/modules/moe.py | 1 + eole/modules/multi_headed_attn.py | 6 ++--- eole/modules/transformer_mlp.py | 13 +++++++---- recipes/model-validator/run.sh | 38 +++++++++++++++++++++++++++---- 7 files changed, 84 insertions(+), 14 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index c5982ef5..939aea02 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -43,6 +43,34 @@ "add_final_linear_bias": False, } }, + "Qwen3ForCausalLM": { + "decoder": { + ".self_attn.q_norm.": ".self_attn.q_norm.", + ".self_attn.k_norm.": ".self_attn.k_norm.", + }, + "config": { + "decoder": { + "query_norm": True, + "key_norm": True, + } + }, + }, + "Qwen3MoeForCausalLM": { + "decoder": { + ".self_attn.q_norm.": ".self_attn.q_norm.", + ".self_attn.k_norm.": ".self_attn.k_norm.", + ".mlp.gate.": ".mlp.gate.", + **{f".mlp.experts.{i}.gate_up_proj.": f".mlp.experts.{i}.gate_proj." for i in range(128)}, + **{f".mlp.experts.{i}.up_proj.": f".mlp.experts.{i}.up_proj." for i in range(128)}, + **{f".mlp.experts.{i}.down_proj.": f".mlp.experts.{i}.down_proj." 
for i in range(128)}, + }, + "config": { + "decoder": { + "query_norm": True, + "key_norm": True, + } + }, + }, "Gemma2ForCausalLM": { "decoder": { ".pre_feedforward_layernorm.": ".pre_feedforward_layernorm.", @@ -250,6 +278,10 @@ "embeddings": { "normalize": True, }, + "decoder": { + "query_norm": True, + "key_norm": True, + }, }, }, "M2M100ForConditionalGeneration": { diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 18c6199b..f9d0d32d 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -286,6 +286,7 @@ def build_config_dict(hf): config.get("n_head", config.get("n_heads", config.get("decoder_attention_heads", None))), ), # default 32 patch for mistral-community/pixtral-12b "transformer_ff": config.get("intermediate_size", config.get("decoder_ffn_dim", None)), + "transformer_ff_moe": config.get("moe_intermediate_size", None), "mlp_activation_fn": ACT_TABLE[arch], "layer_norm": LN_TABLE[arch], "heads_kv": config.get("multi_query", False) @@ -304,7 +305,7 @@ def build_config_dict(hf): config.get("layer_norm_epsilon", config.get("layer_norm_eps", 1e-5)), ), "sliding_window": config.get("sliding_window", 0) or 4096, - "num_experts": config.get("num_local_experts", 0), + "num_experts": config.get("num_local_experts", config.get("num_experts", 0)), "num_experts_per_tok": config.get("num_experts_per_tok", 0), "add_qkvbias": False, "add_final_linear_bias": False, @@ -360,8 +361,6 @@ def build_config_dict(hf): model_config["adapter"] = "gemma3" # for decoder model_config["decoder"] = { - "query_norm": True, - "key_norm": True, "rope_config": { "rotary_theta": 1000000, "scaling_type": "gemma3", diff --git a/eole/config/models.py b/eole/config/models.py index 1650db35..4f639c81 100644 --- a/eole/config/models.py +++ b/eole/config/models.py @@ -267,6 +267,9 @@ class TransformerConfig(Config): ) num_experts: int = Field(default=0, description="Number of experts for MoE models.") num_experts_per_tok: int = 
Field(default=2, description="Number of experts per token.") + transformer_ff_moe: int | None = Field( + default=None, description="Size of hidden transformer feed-forward for MoE models." + ) # These fields are set at EmbeddingsConfig level but will be copied here to be accessible in MHA position_encoding_type: PositionEncodingType | None = Field( default=PositionEncodingType.SinusoidalInterleaved, diff --git a/eole/modules/moe.py b/eole/modules/moe.py index 9a6f7b66..73363812 100644 --- a/eole/modules/moe.py +++ b/eole/modules/moe.py @@ -18,6 +18,7 @@ def __init__( MLP( model_config, running_config, + is_moe=True, ) for i in range(model_config.num_experts) ] diff --git a/eole/modules/multi_headed_attn.py b/eole/modules/multi_headed_attn.py index 92a3e554..7d1e1f13 100644 --- a/eole/modules/multi_headed_attn.py +++ b/eole/modules/multi_headed_attn.py @@ -14,7 +14,7 @@ from .alibi_position_bias import AlibiPositionalBias from .rope import apply_rotary_emb -from eole.modules.rmsnorm import GemmaRMSNorm +from eole.constants import LayerNorm # Help functions to split model dim per head @@ -112,9 +112,9 @@ def __init__(self, model_config, running_config=None, is_decoder: bool = True) - # introduced for gemma3 if model_config.query_norm: - self.q_norm = GemmaRMSNorm(model_config.head_dim, eps=model_config.norm_eps) + self.q_norm = LayerNorm[model_config.layer_norm](model_config.head_dim, eps=model_config.norm_eps) if model_config.key_norm: - self.k_norm = GemmaRMSNorm(model_config.head_dim, eps=model_config.norm_eps) + self.k_norm = LayerNorm[model_config.layer_norm](model_config.head_dim, eps=model_config.norm_eps) self.final_linear = skip_init( nn.Linear, diff --git a/eole/modules/transformer_mlp.py b/eole/modules/transformer_mlp.py index d4106bcc..2063ee29 100644 --- a/eole/modules/transformer_mlp.py +++ b/eole/modules/transformer_mlp.py @@ -19,21 +19,26 @@ def __init__( self, model_config, running_config=None, + is_moe=False, ): self.parallel_gpu = 
getattr(running_config, "parallel_gpu", 1) super(MLP, self).__init__() + if is_moe: + ff_dim = model_config.transformer_ff_moe + else: + ff_dim = model_config.transformer_ff assert ( - model_config.transformer_ff % self.parallel_gpu == 0 + ff_dim % self.parallel_gpu == 0 ), "Model intermediate ffn size must be divisible by the number of partitions" self.gate_up_proj = skip_init( nn.Linear, in_features=model_config.hidden_size, - out_features=model_config.transformer_ff // self.parallel_gpu, + out_features=ff_dim // self.parallel_gpu, bias=model_config.add_ffnbias, ) self.down_proj = skip_init( nn.Linear, - in_features=model_config.transformer_ff // self.parallel_gpu, + in_features=ff_dim // self.parallel_gpu, out_features=model_config.hidden_size, bias=model_config.add_ffnbias, ) @@ -46,7 +51,7 @@ def __init__( skip_init( nn.Linear, in_features=model_config.hidden_size, - out_features=model_config.transformer_ff // self.parallel_gpu, + out_features=ff_dim // self.parallel_gpu, bias=model_config.add_ffnbias, ) if model_config.mlp_activation_fn in ["gated-silu", "gated-gelu", "gated-gelu-tanh"] diff --git a/recipes/model-validator/run.sh b/recipes/model-validator/run.sh index e738997b..5381e6d2 100755 --- a/recipes/model-validator/run.sh +++ b/recipes/model-validator/run.sh @@ -17,20 +17,48 @@ models=( "microsoft/Phi-3.5-mini-instruct" "microsoft/Phi-3-mini-128k-instruct" "microsoft/phi-2" + # Needs quantization to be tested on 24GB GPU + # "Qwen/Qwen3-30B-A3B|quant" + # seems ok + # "Qwen/Qwen3-0.6B" + # "Qwen/Qwen3-1.7B" + # "Qwen/Qwen3-4B" + # "Qwen/Qwen3-8B" + # "Qwen/Qwen3-14B" + # "Qwen/Qwen2-0.5B" + # "Qwen/Qwen2.5-0.5B" + # "Qwen/Qwen2.5-0.5B-Instruct" + # "Qwen/Qwen2-1.5B" + # "Qwen/Qwen2.5-1.5B" + # "Qwen/Qwen2.5-1.5B-Instruct" + # "Qwen/Qwen2.5-3B" + # "Qwen/Qwen2.5-3B-Instruct" # to work on # "mistralai/Mathstral-7B-v0.1" # fp32 ! 
# "microsoft/Phi-3.5-MoE-instruct" # convert_HF not set for PhiMoEForCausalLM # "microsoft/Phi-3-small-128k-instruct" # tokenizer to be taken from another model ) +QUANT_SETTINGS="--quant_type bnb_NF4 --quant_layers gate_up_proj down_proj up_proj linear_values linear_query linear_keys final_linear w_in w_out" + # Log file for errors ERROR_LOG="$SCRIPT_DIR/error_log.txt" echo "Error log for $(date)" > "$ERROR_LOG" # Loop through models -for model_path in "${models[@]}"; do +for model_entry in "${models[@]}"; do + IFS='|' read -r model_path model_flag <<< "$model_entry" model_name=$(basename "$model_path") + # Determine quantization + quant_args="" + if [[ "$model_flag" == "quant" ]]; then + echo "Quantization enabled for $model_name" + quant_args=$QUANT_SETTINGS + else + echo "Quantization disabled for $model_name" + fi + echo "=================================================" echo "Processing model: $model_name" echo "Path: $model_path" @@ -46,7 +74,7 @@ for model_path in "${models[@]}"; do # Step 1: Convert the model echo "Converting to $MODEL_DIR" - if ! eole convert HF --model_dir "$model_path" --output "$MODEL_DIR" --token "$HF_TOKEN"; then + if ! eole convert HF --model_dir "$model_path" --output "$MODEL_DIR" --token "$HF_TOKEN" --nshards 2; then echo "Error: Conversion failed for $model_name" | tee -a "$ERROR_LOG" continue fi @@ -54,6 +82,8 @@ for model_path in "${models[@]}"; do # Step 2: Prepare the prompt echo "Preparing prompt for testing:" PROMPT="What are some nice places to visit in France?" + # special tokens prompt (to check Qwen instruct models for instance) + # PROMPT="<|im_start|>user\nWhat are some nice places to visit in France?<|im_end|>\n<|im_start|>assistant\n" echo "\"$PROMPT\"" if ! 
echo -e "$PROMPT" | sed ':a;N;$!ba;s/\n/⦅newline⦆/g' > "$test_prompt_file"; then echo "Error: Failed to prepare prompt for $model_name" | tee -a "$ERROR_LOG" @@ -63,7 +93,7 @@ for model_path in "${models[@]}"; do # Step 3: Run prediction echo "Running prediction:" - if ! eole predict -model_path "$MODEL_DIR" -gpu_ranks 0 -src "$test_prompt_file" -output "$test_output_file"; then + if ! eole predict -model_path "$MODEL_DIR" -gpu_ranks 0 -src "$test_prompt_file" -output "$test_output_file" $QUANT_SETTINGS; then echo "Error: Prediction failed for $model_name" | tee -a "$ERROR_LOG" continue fi @@ -80,7 +110,7 @@ for model_path in "${models[@]}"; do # Step 5: Run MMLU echo "MMLU for $model_name:" echo "-------------------------------------------------" - if ! eole tools run_mmlu -model_path "$MODEL_DIR" -gpu_ranks 0 -batch_size 1 -batch_type sents; then + if ! eole tools run_mmlu -model_path "$MODEL_DIR" -gpu_ranks 0 -batch_size 1 -batch_type sents $QUANT_SETTINGS; then echo "Error: Failed to run MMLU for $model_name" | tee -a "$ERROR_LOG" continue fi From 2cb1d56f3929c389423a3ac8feafa8f5a46dd0f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 15 May 2025 08:14:22 +0000 Subject: [PATCH 11/14] fix mixtral mapping --- eole/bin/convert/HF_mappings.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 939aea02..8b99543b 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -88,11 +88,16 @@ "decoder": { ".mlp.gate.": ".block_sparse_moe.gate.", **{ - f".mlp.experts.{i}.{attr}": f".block_sparse_moe.experts.{i}.w{j}." + f".mlp.experts.{i}.{attr}": f".block_sparse_moe.experts.{i}.w{j+1}." 
for i in range(8) for j, attr in enumerate(["gate_up_proj.", "down_proj.", "up_proj."]) }, **{f".mlp.experts.{i}.layer_norm.weight": ".post_attention_layernorm.weight" for i in range(8)}, + }, + "config": { + "decoder": { + "transformer_ff_moe": 14336 + } } }, "PhiForCausalLM": { From 8adc9806fe7f0e1b87406887a1b884275179f0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 15 May 2025 08:16:05 +0000 Subject: [PATCH 12/14] update model-validator --- recipes/model-validator/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/model-validator/run.sh b/recipes/model-validator/run.sh index 5381e6d2..e2786a8f 100755 --- a/recipes/model-validator/run.sh +++ b/recipes/model-validator/run.sh @@ -34,6 +34,7 @@ models=( # "Qwen/Qwen2.5-3B" # "Qwen/Qwen2.5-3B-Instruct" # to work on + # "mistralai/Mixtral-8x7B-Instruct-v0.1|quant" # "mistralai/Mathstral-7B-v0.1" # fp32 ! # "microsoft/Phi-3.5-MoE-instruct" # convert_HF not set for PhiMoEForCausalLM # "microsoft/Phi-3-small-128k-instruct" # tokenizer to be taken from another model From ccb9cb26eab0bab8f2274871b55f6835dcd6457d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 15 May 2025 09:10:11 +0000 Subject: [PATCH 13/14] patch mixtral shared_layer_norm --- eole/bin/convert/HF_mappings.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 8b99543b..1f521b45 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -92,12 +92,13 @@ for i in range(8) for j, attr in enumerate(["gate_up_proj.", "down_proj.", "up_proj."]) }, - **{f".mlp.experts.{i}.layer_norm.weight": ".post_attention_layernorm.weight" for i in range(8)}, + ".post_attention_layernorm.weight": ".post_attention_layernorm.weight", }, "config": { "decoder": { "transformer_ff_moe": 14336 - } + }, + "shared_layer_norm": True, } }, "PhiForCausalLM": { From 
11ab596d9bade08edc7e31b802435f7a3b9652b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 15 May 2025 12:44:30 +0000 Subject: [PATCH 14/14] black --- eole/bin/convert/HF_mappings.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/eole/bin/convert/HF_mappings.py b/eole/bin/convert/HF_mappings.py index 1f521b45..be30524a 100644 --- a/eole/bin/convert/HF_mappings.py +++ b/eole/bin/convert/HF_mappings.py @@ -95,11 +95,9 @@ ".post_attention_layernorm.weight": ".post_attention_layernorm.weight", }, "config": { - "decoder": { - "transformer_ff_moe": 14336 - }, + "decoder": {"transformer_ff_moe": 14336}, "shared_layer_norm": True, - } + }, }, "PhiForCausalLM": { "decoder.layer_norm.weight": "model.final_layernorm.weight",