@@ -13,6 +13,7 @@
 
 import torch
 from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor
+from lightning_utilities.core.imports import RequirementCache
 from safetensors.torch import load_file as load_safetensors
 from tqdm import tqdm
 
@@ -286,6 +287,17 @@ def copy_weights_gemma_2(
             pbar.update(progress_per_file)
 
 
+_TRANSFORMERS_GREATER_EQUAL_4_52 = RequirementCache("transformers>=4.52.0")
+
+GEMMA3_LANGUAGE_MODEL_PREFIX = "model.language_model" if _TRANSFORMERS_GREATER_EQUAL_4_52 else "language_model.model"
+
+GEMMA3_VISION_MODEL_PREFIX = "model.vision_tower" if _TRANSFORMERS_GREATER_EQUAL_4_52 else "vision_tower"
+
+GEMMA3_MM_PROJECTOR_PREFIX = (
+    "model.multi_modal_projector" if _TRANSFORMERS_GREATER_EQUAL_4_52 else "multi_modal_projector"
+)
+
+
 def copy_weights_gemma_3(
     qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
     state_dict: Dict[str, torch.Tensor],
@@ -319,15 +331,21 @@ def copy_weights_gemma_3(
     if progress_per_file is not None:
         progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights))
     # gemma3 4b+ are multimodal models, but we are only loading the text weights
-    is_multimodal = any(k.startswith("language_model") for k in hf_weights)
+    is_multimodal = any(k.startswith(GEMMA3_LANGUAGE_MODEL_PREFIX) for k in hf_weights)
     if is_multimodal:
         warnings.warn("For Gemma3 models only the text component is supported.")
-        weight_map = {f"language_model.{k}": v for k, v in weight_map.items()}
+        new_weight_map = dict()
+        prefix = "model"
+        for k, v in weight_map.items():
+            if k.startswith(prefix):
+                k = GEMMA3_LANGUAGE_MODEL_PREFIX + k[len(prefix) :]
+            new_weight_map[k] = v
+        weight_map = new_weight_map
     for from_name, param in hf_weights.items():
-        if from_name.startswith("vision_tower") or from_name.startswith("multi_modal_projector"):
+        if from_name.startswith(GEMMA3_VISION_MODEL_PREFIX) or from_name.startswith(GEMMA3_MM_PROJECTOR_PREFIX):
             continue
         name_template, *ids = layer_template(from_name, num_matches=2)
-        to_name = weight_map[name_template]
+        to_name = weight_map.get(name_template)
         param = load_param(param, from_name, dtype, verbose=debug_mode)
         # in multimodal models, the text weights are the first part of the weights
         if is_multimodal and to_name == "transformer.wte.weight" and config is not None:
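
The remapping loop replaces the old blanket `f"language_model.{k}"` prefixing: only `weight_map` keys rooted at `model` are re-rooted under the version-dependent language-model prefix, and the switch from `weight_map[name_template]` to `weight_map.get(name_template)` lets any remaining unmapped key resolve to `None` instead of raising `KeyError`. A self-contained sketch of the rewrite; the sample entries are hypothetical, chosen only to show which keys move:

```python
GEMMA3_LANGUAGE_MODEL_PREFIX = "model.language_model"  # transformers>=4.52 layout

# Hypothetical weight_map excerpt (HF key template -> litgpt key template).
weight_map = {
    "model.embed_tokens.weight": "transformer.wte.weight",  # rooted at "model": re-rooted
    "lm_head.weight": "lm_head.weight",  # not rooted at "model": kept as-is
}

prefix = "model"
new_weight_map = {}
for k, v in weight_map.items():
    if k.startswith(prefix):
        # "model.embed_tokens.weight" -> "model.language_model.embed_tokens.weight"
        k = GEMMA3_LANGUAGE_MODEL_PREFIX + k[len(prefix) :]
    new_weight_map[k] = v

assert new_weight_map == {
    "model.language_model.embed_tokens.weight": "transformer.wte.weight",
    "lm_head.weight": "lm_head.weight",
}

# .get() tolerates keys that exist only in the vision tower / projector,
# which copy_weights_gemma_3 skips rather than converts.
assert new_weight_map.get("model.vision_tower.embeddings.patch_embedding.weight") is None
```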