
Commit 552ac10

feat: load only text weights from multimodal gemma (#2008)
1 parent 15461ff commit 552ac10

File tree

2 files changed (+86 lines, -3 lines)

litgpt/scripts/convert_hf_checkpoint.py

Lines changed: 13 additions & 2 deletions
@@ -4,6 +4,7 @@
 import json
 import os
 import re
+import warnings
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
@@ -294,6 +295,7 @@ def copy_weights_gemma_3(
     pbar: Optional[tqdm] = None,
     progress_per_file: Optional[float] = None,
     debug_mode: Optional[bool] = False,
+    config: Optional[Config] = None,
 ) -> None:
     weight_map = {
         "model.embed_tokens.weight": "transformer.wte.weight",
@@ -316,11 +318,20 @@ def copy_weights_gemma_3(
 
     if progress_per_file is not None:
         progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights))
-
+    # gemma3 4b+ are multimodal models, but we are only loading the text weights
+    is_multimodal = any(k.startswith("language_model") for k in hf_weights)
+    if is_multimodal:
+        warnings.warn("For Gemma3 models only the text component is supported.")
+        weight_map = {f"language_model.{k}": v for k, v in weight_map.items()}
     for from_name, param in hf_weights.items():
+        if from_name.startswith("vision_tower") or from_name.startswith("multi_modal_projector"):
+            continue
         name_template, *ids = layer_template(from_name, num_matches=2)
         to_name = weight_map[name_template]
         param = load_param(param, from_name, dtype, verbose=debug_mode)
+        # in multimodal models, the text weights are the first part of the weights
+        if is_multimodal and to_name == "transformer.wte.weight" and config is not None:
+            param = param[: config.vocab_size]
         if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")):
             qkv = qkv_weights.setdefault(ids[0], defaultdict(dict))
             weight_name, weight_type = from_name.split(".")[-2:]
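Taken together, this hunk detects a multimodal checkpoint via the `language_model.` key prefix, warns that only the text component is supported, skips the `vision_tower.*` and `multi_modal_projector.*` tensors, and truncates the embedding table to the text vocabulary. Below is a minimal standalone sketch of that flow with toy tensors; the shapes and key names beyond the prefixes shown above are illustrative, and the real converter resolves names through `layer_template` before the lookup.

import torch

# Toy stand-ins for a multimodal-style HF state dict (illustrative shapes, not the real model).
text_vocab_size = 8          # plays the role of config.vocab_size
hf_weights = {
    "language_model.model.embed_tokens.weight": torch.randn(10, 4),  # text vocab + extra rows
    "vision_tower.patch_embedding.weight": torch.randn(4, 4),
    "multi_modal_projector.linear.weight": torch.randn(4, 4),
}
weight_map = {"model.embed_tokens.weight": "transformer.wte.weight"}

is_multimodal = any(k.startswith("language_model") for k in hf_weights)
if is_multimodal:
    # prefix every known text mapping so "language_model.model..." keys resolve
    weight_map = {f"language_model.{k}": v for k, v in weight_map.items()}

converted = {}
for from_name, param in hf_weights.items():
    if from_name.startswith(("vision_tower", "multi_modal_projector")):
        continue  # vision weights are intentionally dropped
    to_name = weight_map[from_name]
    if is_multimodal and to_name == "transformer.wte.weight":
        param = param[:text_vocab_size]  # keep only the text part of the embedding table
    converted[to_name] = param

print(sorted(converted))                          # ['transformer.wte.weight']
print(converted["transformer.wte.weight"].shape)  # torch.Size([8, 4])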
@@ -604,7 +615,7 @@ def convert_hf_checkpoint(
         copy_fn = partial(copy_weights_gemma_2, qkv_weights)
     elif model_name.lower().startswith("gemma-3"):
         qkv_weights = {}
-        copy_fn = partial(copy_weights_gemma_3, qkv_weights)
+        copy_fn = partial(copy_weights_gemma_3, qkv_weights, config=config)
     elif model_name.lower().startswith("phi"):
         # holder to reconstitute the split q, k, v
         qkv_weights = {}
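With `config` now forwarded, the converter can be pointed at a multimodal Gemma 3 checkpoint and will convert only its text weights. A hedged sketch of how the updated function might be invoked follows; the positional `checkpoint_dir` argument, the `model_name` keyword, and the local directory layout are assumptions, since only the function's internals appear in this diff.

from pathlib import Path

from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint

# Assumes the Hugging Face weights for a multimodal Gemma 3 model were already
# downloaded into this directory (the path is illustrative).
checkpoint_dir = Path("checkpoints/google/gemma-3-4b-it")

# With this commit, the resolved Config reaches copy_weights_gemma_3, which skips
# vision_tower/multi_modal_projector tensors and truncates the embedding table
# to the text vocabulary.
convert_hf_checkpoint(checkpoint_dir, model_name="gemma-3-4b-it")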

tests/test_model.py

Lines changed: 73 additions & 1 deletion
@@ -23,7 +23,7 @@
 from transformers.models.falcon import FalconConfig, FalconForCausalLM
 from transformers.models.gemma import GemmaConfig, GemmaForCausalLM
 from transformers.models.gemma2 import Gemma2Config, Gemma2ForCausalLM
-from transformers.models.gemma3 import Gemma3ForCausalLM, Gemma3TextConfig
+from transformers.models.gemma3 import Gemma3Config, Gemma3ForCausalLM, Gemma3ForConditionalGeneration, Gemma3TextConfig
 from transformers.models.gpt_neox import GPTNeoXConfig, GPTNeoXForCausalLM
 from transformers.models.llama import LlamaConfig, LlamaForCausalLM
 from transformers.models.mistral import MistralConfig, MistralForCausalLM
@@ -872,6 +872,78 @@ def test_against_original_gemma_3(model_name, device, dtype):
     torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)
 
 
+@torch.inference_mode()
+@pytest.mark.parametrize("model_name", ["gemma-3-4b-it", "gemma-3-12b-it", "gemma-3-27b-it"])
+@pytest.mark.parametrize(
+    ("device", "dtype"),
+    [
+        (torch.device("cpu"), torch.float32),
+        pytest.param(
+            torch.device("cuda"),
+            torch.float16,
+            marks=[
+                # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
+                # is slightly different
+                pytest.mark.xfail(raises=AssertionError, strict=False),
+                _RunIf(min_cuda_gpus=1),
+            ],
+        ),
+    ],
+)
+def test_against_multimodal_gemma_3(model_name, device, dtype):
+    torch.set_default_dtype(dtype)
+
+    T = 20
+    ours_config = Config.from_name(
+        model_name,
+        block_size=T,
+        sliding_window_size=T // 2,
+        n_layer=2,
+        n_head=16,
+        n_embd=32,
+        intermediate_size=86,
+    )
+
+    theirs_config = Gemma3Config(
+        Gemma3TextConfig(
+            vocab_size=ours_config.padded_vocab_size,
+            hidden_size=ours_config.n_embd,
+            head_dim=ours_config.head_size,
+            num_attention_heads=ours_config.n_head,
+            num_hidden_layers=ours_config.n_layer,
+            intermediate_size=ours_config.intermediate_size,
+            max_position_embeddings=ours_config.block_size,
+            sliding_window=ours_config.sliding_window_size,
+            rms_norm_eps=ours_config.norm_eps,
+            num_key_value_heads=ours_config.n_query_groups,
+            rope_theta=ours_config.rope_base,
+            attention_bias=ours_config.bias,
+            tie_word_embeddings=True,
+            hidden_act="gelu_pytorch_tanh",
+            attn_implementation="eager",
+            query_pre_attn_scalar=ours_config.attention_scores_scalar,
+            rope_scaling={"factor": 8.0, "rope_type": "linear"},
+            rope_local_base_freq=ours_config.rope_local_base_freq,
+        )
+    )
+
+    theirs_model = Gemma3ForConditionalGeneration(theirs_config).to(device)
+    theirs_state_dict = theirs_model.state_dict()
+
+    state_dict = {}
+
+    copy_weights_gemma_3({}, state_dict, theirs_state_dict, config=ours_config)
+    ours_model = GPT(ours_config).to(device)
+    ours_model.load_state_dict(state_dict)
+
+    # test end to end
+    x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0)
+    assert x.size(1) == T
+    ours_y = ours_model(x)
+    theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
+    torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)
+
+
 @torch.inference_mode()
 @pytest.mark.parametrize(
     "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
