
Commit bee5dca

Fix accessing final norm for Gemma-3 models (microsoft#1687)
### Description

This PR fixes how the final norm is identified for the Gemma-3 models. It works with the latest version of Hugging Face's `transformers` (v4.55.2).

### Motivation and Context

Previous versions of `transformers` made breaking changes to the class structure of the Gemma-3 models. Since `transformers` has [landed on a stable way](huggingface/transformers#36741) to load multi-modal models with `AutoModelForCausalLM` for now, the current approach is to identify the path to `model.model.language_model.norm` for the Gemma-3 models that are multi-modal. Gemma-3 1B's final norm is accessible at `model.model.norm`, while Gemma-3 4B's final norm is accessible at `model.model.language_model.norm`.

For [PEFT's](https://github.com/huggingface/peft) decoder-only models, the core model is accessible at `model.base_model.model` and the final norm is usually accessible at `model.base_model.model.model.norm`. We can read the parent-most class name to identify whether a model is from PEFT or not. One advantage of this approach is that any adaptation in the path to the final norm of a Transformers model can still be found in the PEFT version of that model.
1 parent 47fb158 commit bee5dca
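
For illustration, here is a minimal sketch of where the final norm lives in each case described above. It is not part of the commit; it assumes `transformers` v4.55.2 and `peft` are installed, and the model IDs and LoRA settings are placeholder choices.

```python
# Minimal sketch of the final-norm paths described above (illustrative only).
# Assumes transformers v4.55.2 and peft are installed; the model IDs and LoRA
# settings below are placeholder choices, not part of the commit.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Gemma-3 1B (text-only): final norm at model.model.norm
text_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
print(type(text_model.model.norm))                          # Gemma3RMSNorm

# Gemma-3 4B (multi-modal): final norm at model.model.language_model.norm
mm_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it")
print(type(mm_model.model.language_model.norm))             # Gemma3RMSNorm

# PEFT decoder-only model: the parent-most class name starts with "Peft",
# the core model sits at model.base_model.model, and the final norm is one
# level deeper at model.base_model.model.model.norm.
peft_model = get_peft_model(
    text_model, LoraConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"])
)
print(peft_model.__class__.__name__)                        # PeftModelForCausalLM
print(type(peft_model.base_model.model.model.norm))         # Gemma3RMSNorm
```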

File tree: 1 file changed (+14, −18 lines)


src/python/py/models/builder.py

Lines changed: 14 additions & 18 deletions
```diff
@@ -2713,31 +2713,27 @@ def make_model(self, input_path):
     def has_final_norm(self, module, orig_model):
         # Find where the language model is stored to check attributes. Some classes
         # store the language model in a different attribute than `model.model`.
-        if hasattr(orig_model, "language_model"):
-            # Model is multimodal
-            # Note: This case is checked first because the `language_model` attribute and the `base_model` attribute
-            # exist for both multimodal models and PEFT models. However they represent different classes and their attributes
-            # differ.
-            model = orig_model.language_model
-        elif hasattr(orig_model, "base_model") and hasattr(orig_model.base_model, "model"):
-            if hasattr(orig_model.base_model.model, "model"):
-                # Model is from PEFT
-                model = orig_model.base_model.model
-            else:
-                # Model is text-based only.
-                model = orig_model.base_model
+        if orig_model.__class__.__name__.startswith("Peft"):
+            # Model is from PEFT
+            model = orig_model.base_model.model
         else:
             model = orig_model
 
-        # Hugging Face names
+        # Hugging Face names (all models loaded with AutoModelForCausalLM.from_pretrained)
+        #
+        # hf_norm: for most models
+        # hf_final_layernorm: for Phi-2
+        # hf_transformer_final_layernorm: for ChatGLM-3
+        # hf_language_model_norm: for Gemma-3 multimodal (4B, 12B, 27B)
         hf_norm = hasattr(model, "model") and hasattr(model.model, "norm") and module == model.model.norm
         hf_final_layernorm = hasattr(model, "model") and hasattr(model.model, "final_layernorm") and module == model.model.final_layernorm
         hf_transformer_final_layernorm = hasattr(model, "transformer") and hasattr(model.transformer, "encoder") and hasattr(model.transformer.encoder, "final_layernorm") and module == model.transformer.encoder.final_layernorm
+        hf_language_model_norm = hasattr(model, "model") and hasattr(model.model, "language_model") and hasattr(model.model.language_model, "norm") and module == model.model.language_model.norm
 
-        # GGUF names
+        # GGUF names (all models loaded with GGUFModel.from_pretrained)
         gguf_final_norm = hasattr(model, "final_norm") and module == model.final_norm
 
-        hf_names = [hf_norm, hf_final_layernorm, hf_transformer_final_layernorm]
+        hf_names = [hf_norm, hf_final_layernorm, hf_transformer_final_layernorm, hf_language_model_norm]
         gguf_names = [gguf_final_norm]
         return any(hf_names + gguf_names)
 
```
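As a rough illustration of what the updated checks resolve to, a hypothetical driver that walks a loaded model's modules might look like the sketch below; `builder` stands in for the `Model` instance from builder.py, and the actual call sites in builder.py differ.

```python
# Hypothetical driver for has_final_norm(); `builder` stands in for the
# builder.py Model instance, and the real call sites differ from this sketch.
def find_final_norm(builder, hf_model):
    for name, module in hf_model.named_modules():
        if builder.has_final_norm(module, hf_model):
            return name
    return None

# For multi-modal Gemma-3 loaded with AutoModelForCausalLM this would return
# "model.language_model.norm"; for Gemma-3 1B, "model.norm".
```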

```diff
@@ -3264,7 +3260,7 @@ def make_layernorm(self, layer_id, layernorm, skip, simple, location):
         super().make_layernorm(layer_id, layernorm, skip, simple, location)
 
     def make_layer(self, layer_id, layer):
-        # Gemma2 decoder layer is typically defined as:
+        # Gemma-2 decoder layer is typically defined as:
         # input_layernorm --> attention --> post_attention_layernorm --> pre_ffn_layernorm --> MLP --> post_ffn_layernorm
 
         # Adjust LayerNorm attributes because of extra LayerNorms inserted
```
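Read as a pseudo-forward pass, that ordering looks roughly like the sketch below; the attribute names follow the comment rather than the exact Hugging Face names, and the residual placement is an assumption based on Gemma-2's sandwich norms.

```python
# Pseudo-forward matching the layer ordering in the comment above; attribute
# names follow the comment, not necessarily the Hugging Face implementation.
def gemma2_decoder_layer(layer, hidden_states):
    residual = hidden_states
    hidden_states = layer.input_layernorm(hidden_states)
    hidden_states = layer.attention(hidden_states)
    hidden_states = residual + layer.post_attention_layernorm(hidden_states)

    residual = hidden_states
    hidden_states = layer.pre_ffn_layernorm(hidden_states)
    hidden_states = layer.mlp(hidden_states)
    hidden_states = residual + layer.post_ffn_layernorm(hidden_states)
    return hidden_states
```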
```diff
@@ -3713,7 +3709,7 @@ def make_layer(self, layer_id, layer):
 class Gemma3Model(Gemma2Model):
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
-        self.is_local = lambda layer_id: bool((layer_id + 1) % config.sliding_window_pattern)
+        self.is_local = lambda layer_id: bool((layer_id + 1) % 6)
         self.rope_local_theta = config.rope_local_base_freq
         self.make_rotary_embedding_multi_cache()
 
```
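With the hard-coded modulus, every sixth Gemma-3 layer uses global attention and the rest use local (sliding-window) attention; a quick worked example of the resulting pattern:

```python
# Layer pattern implied by bool((layer_id + 1) % 6): every 6th layer is global.
is_local = lambda layer_id: bool((layer_id + 1) % 6)

print(["local" if is_local(i) else "global" for i in range(12)])
# ['local', 'local', 'local', 'local', 'local', 'global',
#  'local', 'local', 'local', 'local', 'local', 'global']
```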
