393 changes: 393 additions & 0 deletions eole/bin/convert/HF_mappings.py

Large diffs are not rendered by default.

566 changes: 93 additions & 473 deletions eole/bin/convert/convert_HF.py

Large diffs are not rendered by default.

15 changes: 13 additions & 2 deletions eole/config/models.py
@@ -231,6 +231,10 @@ class TransformerConfig(Config):
default=False,
description="Add pre/post_feedforward_layernorm around MLP forward. " "Note: introduced for gemma2 support.",
)
post_attention_layernorm: bool = Field(
default=True,
description="Add post-attention layernorm around MHA forward.",
)
add_qkvbias: bool = Field(
default=False,
description="Add bias to nn.Linear of Query/Key/Value in MHA. "
@@ -263,6 +267,9 @@
)
num_experts: int = Field(default=0, description="Number of experts for MoE models.")
num_experts_per_tok: int = Field(default=2, description="Number of experts per token.")
transformer_ff_moe: int | None = Field(
default=None, description="Size of hidden transformer feed-forward for MoE models."
)
# These fields are set at EmbeddingsConfig level but will be copied here to be accessible in MHA
position_encoding_type: PositionEncodingType | None = Field(
default=PositionEncodingType.SinusoidalInterleaved,
@@ -694,12 +701,16 @@ def encoder_decoder_type(cls, data: Any) -> Any:
# patch to allow transparent setting of encoder/decoder_type
if not (isinstance(data, dict)):
return data
if "encoder" in data.keys():
if isinstance(data.get("encoder", None), Config):
data["encoder"].encoder_type = "transformer"
elif isinstance(data.get("encoder", None), dict):
data["encoder"]["encoder_type"] = "transformer"
else:
data["encoder"] = {"encoder_type": "transformer"}
if "decoder" in data.keys():
if isinstance(data.get("decoder", None), Config):
data["decoder"].decoder_type = "transformer"
elif isinstance(data.get("decoder", None), dict):
data["decoder"]["decoder_type"] = "transformer"
else:
data["decoder"] = {"decoder_type": "transformer"}
return data
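For reference, a minimal sketch of how the two new fields might be combined with the existing MoE options; the field names come from this diff, but the values and the dict-style wiring are illustrative assumptions, not an eole-documented recipe:

```python
# Illustrative only: values are made up, and the exact config plumbing may differ.
decoder_config = {
    "decoder_type": "transformer",
    "hidden_size": 4096,
    "transformer_ff": 11008,            # dense feed-forward width
    "num_experts": 8,                   # > 0 switches the layer MLP to MoE
    "num_experts_per_tok": 2,
    "transformer_ff_moe": 14336,        # new: per-expert feed-forward width
    "post_attention_layernorm": False,  # new: the decoder swaps the norm for nn.Identity()
}
```
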
9 changes: 6 additions & 3 deletions eole/decoders/transformer.py
@@ -62,9 +62,12 @@ def __init__(self, decoder_config, running_config=None, with_cross_attn=False):
self.residual_layernorm = LayerNorm[decoder_config.layer_norm](
decoder_config.hidden_size, eps=decoder_config.norm_eps
)
self.post_attention_layernorm = LayerNorm[decoder_config.layer_norm](
decoder_config.hidden_size, eps=decoder_config.norm_eps
)
if decoder_config.post_attention_layernorm:
self.post_attention_layernorm = LayerNorm[decoder_config.layer_norm](
decoder_config.hidden_size, eps=decoder_config.norm_eps
)
else:
self.post_attention_layernorm = nn.Identity()
if decoder_config.num_experts > 0:
self.mlp = MoE(decoder_config, running_config)
else:
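The fallback to nn.Identity keeps the layer's forward pass structurally unchanged when the norm is disabled. A self-contained illustration of that pattern in plain PyTorch (not eole code):

```python
import torch
import torch.nn as nn

post_attention_layernorm = False  # mirrors decoder_config.post_attention_layernorm
norm = nn.LayerNorm(8) if post_attention_layernorm else nn.Identity()

x = torch.randn(2, 8)
assert norm(x).shape == x.shape  # nn.Identity() is a no-op, so downstream code needs no special case
```
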
1 change: 1 addition & 0 deletions eole/modules/moe.py
@@ -18,6 +18,7 @@ def __init__(
MLP(
model_config,
running_config,
is_moe=True,
)
for i in range(model_config.num_experts)
]
6 changes: 3 additions & 3 deletions eole/modules/multi_headed_attn.py
@@ -14,7 +14,7 @@
from .alibi_position_bias import AlibiPositionalBias
from .rope import apply_rotary_emb

from eole.modules.rmsnorm import GemmaRMSNorm
from eole.constants import LayerNorm


# Help functions to split model dim per head
@@ -112,9 +112,9 @@ def __init__(self, model_config, running_config=None, is_decoder: bool = True) -

# introduced for gemma3
if model_config.query_norm:
self.q_norm = GemmaRMSNorm(model_config.head_dim, eps=model_config.norm_eps)
self.q_norm = LayerNorm[model_config.layer_norm](model_config.head_dim, eps=model_config.norm_eps)
if model_config.key_norm:
self.k_norm = GemmaRMSNorm(model_config.head_dim, eps=model_config.norm_eps)
self.k_norm = LayerNorm[model_config.layer_norm](model_config.head_dim, eps=model_config.norm_eps)

self.final_linear = skip_init(
nn.Linear,
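Replacing the hard-coded GemmaRMSNorm with LayerNorm[model_config.layer_norm] lets the q/k norms follow whatever norm class the config selects. A rough sketch of that kind of registry lookup; the mapping below is an assumption for illustration, not the contents of eole.constants.LayerNorm, and it assumes a recent PyTorch (>= 2.4) for nn.RMSNorm:

```python
import torch
import torch.nn as nn

# Hypothetical registry; the real eole.constants.LayerNorm mapping may differ.
NORMS = {"standard": nn.LayerNorm, "rms": nn.RMSNorm}

layer_norm, head_dim, norm_eps = "rms", 128, 1e-6
q_norm = NORMS[layer_norm](head_dim, eps=norm_eps)  # analogous to LayerNorm[model_config.layer_norm](...)
k_norm = NORMS[layer_norm](head_dim, eps=norm_eps)

q = torch.randn(2, 8, 4, head_dim)  # (batch, heads, seq, head_dim)
print(q_norm(q).shape)              # norm is applied over the last (head_dim) axis
```
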
13 changes: 9 additions & 4 deletions eole/modules/transformer_mlp.py
@@ -19,21 +19,26 @@ def __init__(
self,
model_config,
running_config=None,
is_moe=False,
):
self.parallel_gpu = getattr(running_config, "parallel_gpu", 1)
super(MLP, self).__init__()
if is_moe:
ff_dim = model_config.transformer_ff_moe
else:
ff_dim = model_config.transformer_ff
assert (
model_config.transformer_ff % self.parallel_gpu == 0
ff_dim % self.parallel_gpu == 0
), "Model intermediate ffn size must be divisible by the number of partitions"
self.gate_up_proj = skip_init(
nn.Linear,
in_features=model_config.hidden_size,
out_features=model_config.transformer_ff // self.parallel_gpu,
out_features=ff_dim // self.parallel_gpu,
bias=model_config.add_ffnbias,
)
self.down_proj = skip_init(
nn.Linear,
in_features=model_config.transformer_ff // self.parallel_gpu,
in_features=ff_dim // self.parallel_gpu,
out_features=model_config.hidden_size,
bias=model_config.add_ffnbias,
)
@@ -46,7 +51,7 @@ def __init__(
skip_init(
nn.Linear,
in_features=model_config.hidden_size,
out_features=model_config.transformer_ff // self.parallel_gpu,
out_features=ff_dim // self.parallel_gpu,
bias=model_config.add_ffnbias,
)
if model_config.mlp_activation_fn in ["gated-silu", "gated-gelu", "gated-gelu-tanh"]
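The MLP now picks its width from transformer_ff_moe when instantiated as an expert, and the divisibility check applies to whichever width is selected. Worked through with illustrative numbers (not tied to any specific checkpoint):

```python
transformer_ff = 11008       # dense MLP width
transformer_ff_moe = 14336   # per-expert width used when is_moe=True
parallel_gpu = 2             # tensor-parallel partitions

is_moe = True
ff_dim = transformer_ff_moe if is_moe else transformer_ff
assert ff_dim % parallel_gpu == 0, "Model intermediate ffn size must be divisible by the number of partitions"
per_rank_out = ff_dim // parallel_gpu  # 7168 output features per rank for gate_up_proj / up_proj
```
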
4 changes: 3 additions & 1 deletion eole/predict/inference.py
@@ -443,7 +443,9 @@ def _process_bucket(bucket_predictions):
batch_data = self.predict_batch(batch, attn_debug)

predictions = prediction_builder.from_batch(batch_data)
is_seq2seq = hasattr(self.model, "encoder") and hasattr(self.model, "decoder")
is_seq2seq = (
getattr(self.model, "encoder", None) is not None and getattr(self.model, "decoder", None) is not None
)
if (
is_seq2seq
and self._tgt_sep_idx != self._tgt_unk_idx
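The reason for the stricter check: a decoder-only model object can still carry an encoder attribute that is set to None, which hasattr alone would count as seq2seq. A toy illustration (toy class, not an eole model):

```python
class ToyDecoderOnly:
    encoder = None      # attribute exists but holds no module
    decoder = object()

m = ToyDecoderOnly()
old_check = hasattr(m, "encoder") and hasattr(m, "decoder")                                       # True  (misclassified)
new_check = getattr(m, "encoder", None) is not None and getattr(m, "decoder", None) is not None   # False (correct)
print(old_check, new_check)
```
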
40 changes: 36 additions & 4 deletions recipes/model-validator/run.sh
@@ -16,20 +16,50 @@ models=(
"meta-llama/CodeLlama-7b-hf"
"microsoft/Phi-3.5-mini-instruct"
"microsoft/Phi-3-mini-128k-instruct"
"microsoft/phi-2"
# Needs quantization to be tested on 24GB GPU
# "Qwen/Qwen3-30B-A3B|quant"
# seems ok
# "Qwen/Qwen3-0.6B"
# "Qwen/Qwen3-1.7B"
# "Qwen/Qwen3-4B"
# "Qwen/Qwen3-8B"
# "Qwen/Qwen3-14B"
# "Qwen/Qwen2-0.5B"
# "Qwen/Qwen2.5-0.5B"
# "Qwen/Qwen2.5-0.5B-Instruct"
# "Qwen/Qwen2-1.5B"
# "Qwen/Qwen2.5-1.5B"
# "Qwen/Qwen2.5-1.5B-Instruct"
# "Qwen/Qwen2.5-3B"
# "Qwen/Qwen2.5-3B-Instruct"
# to work on
# "mistralai/Mixtral-8x7B-Instruct-v0.1|quant"
# "mistralai/Mathstral-7B-v0.1" # fp32 !
# "microsoft/Phi-3.5-MoE-instruct" # convert_HF not set for PhiMoEForCausalLM
# "microsoft/Phi-3-small-128k-instruct" # tokenizer to be taken from another model
)

QUANT_SETTINGS="--quant_type bnb_NF4 --quant_layers gate_up_proj down_proj up_proj linear_values linear_query linear_keys final_linear w_in w_out"

# Log file for errors
ERROR_LOG="$SCRIPT_DIR/error_log.txt"
echo "Error log for $(date)" > "$ERROR_LOG"

# Loop through models
for model_path in "${models[@]}"; do
for model_entry in "${models[@]}"; do
IFS='|' read -r model_path model_flag <<< "$model_entry"
model_name=$(basename "$model_path")

# Determine quantization
quant_args=""
if [[ "$model_flag" == "quant" ]]; then
echo "Quantization enabled for $model_name"
quant_args=$QUANT_SETTINGS
else
echo "Quantization disabled for $model_name"
fi

echo "================================================="
echo "Processing model: $model_name"
echo "Path: $model_path"
@@ -45,14 +75,16 @@ for model_path in "${models[@]}"; do

# Step 1: Convert the model
echo "Converting to $MODEL_DIR"
if ! eole convert HF --model_dir "$model_path" --output "$MODEL_DIR" --token "$HF_TOKEN"; then
if ! eole convert HF --model_dir "$model_path" --output "$MODEL_DIR" --token "$HF_TOKEN" --nshards 2; then
echo "Error: Conversion failed for $model_name" | tee -a "$ERROR_LOG"
continue
fi

# Step 2: Prepare the prompt
echo "Preparing prompt for testing:"
PROMPT="What are some nice places to visit in France?"
# special tokens prompt (to check Qwen instruct models for instance)
# PROMPT="<|im_start|>user\nWhat are some nice places to visit in France?<|im_end|>\n<|im_start|>assistant\n"
echo "\"$PROMPT\""
if ! echo -e "$PROMPT" | sed ':a;N;$!ba;s/\n/⦅newline⦆/g' > "$test_prompt_file"; then
echo "Error: Failed to prepare prompt for $model_name" | tee -a "$ERROR_LOG"
@@ -62,7 +94,7 @@ for model_path in "${models[@]}"; do

# Step 3: Run prediction
echo "Running prediction:"
if ! eole predict -model_path "$MODEL_DIR" -gpu_ranks 0 -src "$test_prompt_file" -output "$test_output_file"; then
if ! eole predict -model_path "$MODEL_DIR" -gpu_ranks 0 -src "$test_prompt_file" -output "$test_output_file" $quant_args; then
echo "Error: Prediction failed for $model_name" | tee -a "$ERROR_LOG"
continue
fi
@@ -79,7 +111,7 @@ for model_path in "${models[@]}"; do
# Step 5: Run MMLU
echo "MMLU for $model_name:"
echo "-------------------------------------------------"
if ! eole tools run_mmlu -model_path "$MODEL_DIR" -gpu_ranks 0 -batch_size 1 -batch_type sents; then
if ! eole tools run_mmlu -model_path "$MODEL_DIR" -gpu_ranks 0 -batch_size 1 -batch_type sents $quant_args; then
echo "Error: Failed to run MMLU for $model_name" | tee -a "$ERROR_LOG"
continue
fi