393 changes: 393 additions & 0 deletions eole/bin/convert/HF_mappings.py

Large diffs are not rendered by default.

566 changes: 93 additions & 473 deletions eole/bin/convert/convert_HF.py

Large diffs are not rendered by default.

15 changes: 13 additions & 2 deletions eole/config/models.py
@@ -231,6 +231,10 @@ class TransformerConfig(Config):
default=False,
description="Add pre/post_feedforward_layernorm around MLP forward. " "Note: introduced for gemma2 support.",
)
post_attention_layernorm: bool = Field(
default=True,
description="Add post-attention layernorm around MHA forward.",
)
add_qkvbias: bool = Field(
default=False,
description="Add bias to nn.Linear of Query/Key/Value in MHA. "
@@ -263,6 +267,9 @@
)
num_experts: int = Field(default=0, description="Number of experts for MoE models.")
num_experts_per_tok: int = Field(default=2, description="Number of experts per token.")
transformer_ff_moe: int | None = Field(
default=None, description="Size of hidden transformer feed-forward for MoE models."
)
# These fields are set at EmbeddingsConfig level but will be copied here to be accessible in MHA
position_encoding_type: PositionEncodingType | None = Field(
default=PositionEncodingType.SinusoidalInterleaved,
@@ -694,12 +701,16 @@ def encoder_decoder_type(cls, data: Any) -> Any:
# patch to allow transparent setting of encoder/decoder_type
if not (isinstance(data, dict)):
return data
if "encoder" in data.keys():
if isinstance(data.get("encoder", None), Config):
data["encoder"].encoder_type = "transformer"
elif isinstance(data.get("encoder", None), dict):
data["encoder"]["encoder_type"] = "transformer"
else:
data["encoder"] = {"encoder_type": "transformer"}
if "decoder" in data.keys():
if isinstance(data.get("decoder", None), Config):
data["decoder"].decoder_type = "transformer"
elif isinstance(data.get("decoder", None), dict):
data["decoder"]["decoder_type"] = "transformer"
else:
data["decoder"] = {"decoder_type": "transformer"}
return data
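For reference, a minimal sketch of how the two new fields might be combined with the existing MoE options; the field names come from this diff, but the values and the dict-style wiring are illustrative assumptions, not an eole-documented recipe:

```python
# Illustrative only: values are made up, and the exact config plumbing may differ.
decoder_config = {
    "decoder_type": "transformer",
    "hidden_size": 4096,
    "transformer_ff": 11008,            # dense feed-forward width
    "num_experts": 8,                   # > 0 switches the layer MLP to MoE
    "num_experts_per_tok": 2,
    "transformer_ff_moe": 14336,        # new: per-expert feed-forward width
    "post_attention_layernorm": False,  # new: the decoder swaps the norm for nn.Identity()
}
```
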
9 changes: 6 additions & 3 deletions eole/decoders/transformer.py
@@ -62,9 +62,12 @@ def __init__(self, decoder_config, running_config=None, with_cross_attn=False):
self.residual_layernorm = LayerNorm[decoder_config.layer_norm](
decoder_config.hidden_size, eps=decoder_config.norm_eps
)
self.post_attention_layernorm = LayerNorm[decoder_config.layer_norm](
decoder_config.hidden_size, eps=decoder_config.norm_eps
)
if decoder_config.post_attention_layernorm:
self.post_attention_layernorm = LayerNorm[decoder_config.layer_norm](
decoder_config.hidden_size, eps=decoder_config.norm_eps
)
else:
self.post_attention_layernorm = nn.Identity()
if decoder_config.num_experts > 0:
self.mlp = MoE(decoder_config, running_config)
else:
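The fallback to nn.Identity keeps the layer's forward pass structurally unchanged when the norm is disabled. A self-contained illustration of that pattern in plain PyTorch (not eole code):

```python
import torch
import torch.nn as nn

post_attention_layernorm = False  # mirrors decoder_config.post_attention_layernorm
norm = nn.LayerNorm(8) if post_attention_layernorm else nn.Identity()

x = torch.randn(2, 8)
assert norm(x).shape == x.shape  # nn.Identity() is a no-op, so downstream code needs no special case
```
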
1 change: 1 addition & 0 deletions eole/modules/moe.py
@@ -18,6 +18,7 @@ def __init__(
MLP(
model_config,
running_config,
is_moe=True,
)
for i in range(model_config.num_experts)
]
6 changes: 3 additions & 3 deletions eole/modules/multi_headed_attn.py
@@ -14,7 +14,7 @@
from .alibi_position_bias import AlibiPositionalBias
from .rope import apply_rotary_emb

from eole.modules.rmsnorm import GemmaRMSNorm
from eole.constants import LayerNorm


# Help functions to split model dim per head
@@ -112,9 +112,9 @@ def __init__(self, model_config, running_config=None, is_decoder: bool = True) -

# introduced for gemma3
if model_config.query_norm:
self.q_norm = GemmaRMSNorm(model_config.head_dim, eps=model_config.norm_eps)
self.q_norm = LayerNorm[model_config.layer_norm](model_config.head_dim, eps=model_config.norm_eps)
if model_config.key_norm:
self.k_norm = GemmaRMSNorm(model_config.head_dim, eps=model_config.norm_eps)
self.k_norm = LayerNorm[model_config.layer_norm](model_config.head_dim, eps=model_config.norm_eps)

self.final_linear = skip_init(
nn.Linear,
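Replacing the hard-coded GemmaRMSNorm with LayerNorm[model_config.layer_norm] lets the q/k norms follow whatever norm class the config selects. A rough sketch of that kind of registry lookup; the mapping below is an assumption for illustration, not the contents of eole.constants.LayerNorm, and it assumes a recent PyTorch (>= 2.4) for nn.RMSNorm:

```python
import torch
import torch.nn as nn

# Hypothetical registry; the real eole.constants.LayerNorm mapping may differ.
NORMS = {"standard": nn.LayerNorm, "rms": nn.RMSNorm}

layer_norm, head_dim, norm_eps = "rms", 128, 1e-6
q_norm = NORMS[layer_norm](head_dim, eps=norm_eps)  # analogous to LayerNorm[model_config.layer_norm](...)
k_norm = NORMS[layer_norm](head_dim, eps=norm_eps)

q = torch.randn(2, 8, 4, head_dim)  # (batch, heads, seq, head_dim)
print(q_norm(q).shape)              # norm is applied over the last (head_dim) axis
```
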
13 changes: 9 additions & 4 deletions eole/modules/transformer_mlp.py
@@ -19,21 +19,26 @@ def __init__(
self,
model_config,
running_config=None,
is_moe=False,
):
self.parallel_gpu = getattr(running_config, "parallel_gpu", 1)
super(MLP, self).__init__()
if is_moe:
ff_dim = model_config.transformer_ff_moe
else:
ff_dim = model_config.transformer_ff
assert (
model_config.transformer_ff % self.parallel_gpu == 0
ff_dim % self.parallel_gpu == 0
), "Model intermediate ffn size must be divisible by the number of partitions"
self.gate_up_proj = skip_init(
nn.Linear,
in_features=model_config.hidden_size,
out_features=model_config.transformer_ff // self.parallel_gpu,
out_features=ff_dim // self.parallel_gpu,
bias=model_config.add_ffnbias,
)
self.down_proj = skip_init(
nn.Linear,
in_features=model_config.transformer_ff // self.parallel_gpu,
in_features=ff_dim // self.parallel_gpu,
out_features=model_config.hidden_size,
bias=model_config.add_ffnbias,
)
@@ -46,7 +51,7 @@ def __init__(
skip_init(
nn.Linear,
in_features=model_config.hidden_size,
out_features=model_config.transformer_ff // self.parallel_gpu,
out_features=ff_dim // self.parallel_gpu,
bias=model_config.add_ffnbias,
)
if model_config.mlp_activation_fn in ["gated-silu", "gated-gelu", "gated-gelu-tanh"]
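The MLP now picks its width from transformer_ff_moe when instantiated as an expert, and the divisibility check applies to whichever width is selected. Worked through with illustrative numbers (not tied to any specific checkpoint):

```python
transformer_ff = 11008       # dense MLP width
transformer_ff_moe = 14336   # per-expert width used when is_moe=True
parallel_gpu = 2             # tensor-parallel partitions

is_moe = True
ff_dim = transformer_ff_moe if is_moe else transformer_ff
assert ff_dim % parallel_gpu == 0, "Model intermediate ffn size must be divisible by the number of partitions"
per_rank_out = ff_dim // parallel_gpu  # 7168 output features per rank for gate_up_proj / up_proj
```
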
4 changes: 3 additions & 1 deletion eole/predict/inference.py
@@ -443,7 +443,9 @@ def _process_bucket(bucket_predictions):
batch_data = self.predict_batch(batch, attn_debug)

predictions = prediction_builder.from_batch(batch_data)
is_seq2seq = hasattr(self.model, "encoder") and hasattr(self.model, "decoder")
is_seq2seq = (
getattr(self.model, "encoder", None) is not None and getattr(self.model, "decoder", None) is not None
)
if (
is_seq2seq
and self._tgt_sep_idx != self._tgt_unk_idx
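The reason for the stricter check: a decoder-only model object can still carry an encoder attribute that is set to None, which hasattr alone would count as seq2seq. A toy illustration (toy class, not an eole model):

```python
class ToyDecoderOnly:
    encoder = None      # attribute exists but holds no module
    decoder = object()

m = ToyDecoderOnly()
old_check = hasattr(m, "encoder") and hasattr(m, "decoder")                                       # True  (misclassified)
new_check = getattr(m, "encoder", None) is not None and getattr(m, "decoder", None) is not None   # False (correct)
print(old_check, new_check)
```
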
40 changes: 36 additions & 4 deletions recipes/model-validator/run.sh
@@ -16,20 +16,50 @@ models=(
"meta-llama/CodeLlama-7b-hf"
"microsoft/Phi-3.5-mini-instruct"
"microsoft/Phi-3-mini-128k-instruct"
"microsoft/phi-2"
# Needs quantization to be tested on 24GB GPU
# "Qwen/Qwen3-30B-A3B|quant"
# seems ok
# "Qwen/Qwen3-0.6B"
# "Qwen/Qwen3-1.7B"
# "Qwen/Qwen3-4B"
# "Qwen/Qwen3-8B"
# "Qwen/Qwen3-14B"
# "Qwen/Qwen2-0.5B"
# "Qwen/Qwen2.5-0.5B"
# "Qwen/Qwen2.5-0.5B-Instruct"
# "Qwen/Qwen2-1.5B"
# "Qwen/Qwen2.5-1.5B"
# "Qwen/Qwen2.5-1.5B-Instruct"
# "Qwen/Qwen2.5-3B"
# "Qwen/Qwen2.5-3B-Instruct"
# to work on
# "mistralai/Mixtral-8x7B-Instruct-v0.1|quant"
# "mistralai/Mathstral-7B-v0.1" # fp32 !
# "microsoft/Phi-3.5-MoE-instruct" # convert_HF not set for PhiMoEForCausalLM
# "microsoft/Phi-3-small-128k-instruct" # tokenizer to be taken from another model
)

QUANT_SETTINGS="--quant_type bnb_NF4 --quant_layers gate_up_proj down_proj up_proj linear_values linear_query linear_keys final_linear w_in w_out"

# Log file for errors
ERROR_LOG="$SCRIPT_DIR/error_log.txt"
echo "Error log for $(date)" > "$ERROR_LOG"

# Loop through models
for model_path in "${models[@]}"; do
for model_entry in "${models[@]}"; do
IFS='|' read -r model_path model_flag <<< "$model_entry"
model_name=$(basename "$model_path")

# Determine quantization
quant_args=""
if [[ "$model_flag" == "quant" ]]; then
echo "Quantization enabled for $model_name"
quant_args=$QUANT_SETTINGS
else
echo "Quantization disabled for $model_name"
fi

echo "================================================="
echo "Processing model: $model_name"
echo "Path: $model_path"
@@ -45,14 +75,16 @@ for model_path in "${models[@]}"; do

# Step 1: Convert the model
echo "Converting to $MODEL_DIR"
if ! eole convert HF --model_dir "$model_path" --output "$MODEL_DIR" --token "$HF_TOKEN"; then
if ! eole convert HF --model_dir "$model_path" --output "$MODEL_DIR" --token "$HF_TOKEN" --nshards 2; then
echo "Error: Conversion failed for $model_name" | tee -a "$ERROR_LOG"
continue
fi

# Step 2: Prepare the prompt
echo "Preparing prompt for testing:"
PROMPT="What are some nice places to visit in France?"
# special tokens prompt (to check Qwen instruct models for instance)
# PROMPT="<|im_start|>user\nWhat are some nice places to visit in France?<|im_end|>\n<|im_start|>assistant\n"
echo "\"$PROMPT\""
if ! echo -e "$PROMPT" | sed ':a;N;$!ba;s/\n/⦅newline⦆/g' > "$test_prompt_file"; then
echo "Error: Failed to prepare prompt for $model_name" | tee -a "$ERROR_LOG"
@@ -62,7 +94,7 @@ for model_path in "${models[@]}"; do

# Step 3: Run prediction
echo "Running prediction:"
if ! eole predict -model_path "$MODEL_DIR" -gpu_ranks 0 -src "$test_prompt_file" -output "$test_output_file"; then
if ! eole predict -model_path "$MODEL_DIR" -gpu_ranks 0 -src "$test_prompt_file" -output "$test_output_file" $quant_args; then
echo "Error: Prediction failed for $model_name" | tee -a "$ERROR_LOG"
continue
fi
@@ -79,7 +111,7 @@ for model_path in "${models[@]}"; do
# Step 5: Run MMLU
echo "MMLU for $model_name:"
echo "-------------------------------------------------"
if ! eole tools run_mmlu -model_path "$MODEL_DIR" -gpu_ranks 0 -batch_size 1 -batch_type sents; then
if ! eole tools run_mmlu -model_path "$MODEL_DIR" -gpu_ranks 0 -batch_size 1 -batch_type sents $quant_args; then
echo "Error: Failed to run MMLU for $model_name" | tee -a "$ERROR_LOG"
continue
fi