Skip to content

Commit aa5609d

Browse files
mseeger, pre-commit-ci[bot], and Borda
committed
Fix in convert_hf_checkpoint related to Gemma 3 (Lightning-AI#2062)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka B <[email protected]> Co-authored-by: Jirka Borovec <[email protected]>
1 parent d710988 commit aa5609d

File tree

2 files changed

+14
-7
lines changed

2 files changed

+14
-7
lines changed

.azure/gpu-test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ jobs:
9393
env:
9494
PL_RUN_STANDALONE_TESTS: "1"
9595
# NUM_PARALLEL_TESTS: "10"
96+
NCCL_IGNORE_DISABLED_P2P: "1"
97+
NCCL_DEBUG: "INFO"
9698
timeoutInMinutes: "10"
9799
98100
- bash: |

litgpt/scripts/convert_hf_checkpoint.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import torch
1515
from lightning.fabric.utilities.load import _NotYetLoadedTensor as NotYetLoadedTensor
16+
from lightning_utilities.core.imports import RequirementCache
1617
from safetensors.torch import load_file as load_safetensors
1718
from tqdm import tqdm
1819

@@ -286,11 +287,16 @@ def copy_weights_gemma_2(
286287
pbar.update(progress_per_file)
287288

288289

289-
GEMMA3_LANGUAGE_MODEL_PREFIX = "model.language_model"
290+
_TRANSFORMERS_GREATER_EQUAL_4_52 = RequirementCache("transformers>=4.52.0")
290291

291-
GEMMA3_VISION_MODEL_PREFIX = "model.vision_tower"
292+
GEMMA3_LANGUAGE_MODEL_PREFIX = "model.language_model" if _TRANSFORMERS_GREATER_EQUAL_4_52 else "language_model.model"
293+
294+
GEMMA3_VISION_MODEL_PREFIX = "model.vision_tower" if _TRANSFORMERS_GREATER_EQUAL_4_52 else "vision_tower"
295+
296+
GEMMA3_MM_PROJECTOR_PREFIX = (
297+
"model.multi_modal_projector" if _TRANSFORMERS_GREATER_EQUAL_4_52 else "multi_modal_projector"
298+
)
292299

293-
GEMMA3_MM_PROJECTOR_PREFIX = "model.multi_modal_projector"
294300

295301
def copy_weights_gemma_3(
296302
qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
@@ -325,15 +331,14 @@ def copy_weights_gemma_3(
325331
if progress_per_file is not None:
326332
progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights))
327333
# gemma3 4b+ are multimodel models, but we are only loading the text weights
328-
is_multimodal = any(k.startswith(GEMMA3_VISION_MODEL_PREFIX) for k in hf_weights)
334+
is_multimodal = any(k.startswith(GEMMA3_LANGUAGE_MODEL_PREFIX) for k in hf_weights)
329335
if is_multimodal:
330336
warnings.warn("For Gemma3 models only the text component is supported.")
331337
new_weight_map = dict()
332-
prefix = "model."
333-
len_prefix = len(prefix)
338+
prefix = "model"
334339
for k, v in weight_map.items():
335340
if k.startswith(prefix):
336-
k = "model.language_model." + k[len_prefix:]
341+
k = GEMMA3_LANGUAGE_MODEL_PREFIX + k[len(prefix) :]
337342
new_weight_map[k] = v
338343
weight_map = new_weight_map
339344
for from_name, param in hf_weights.items():

0 commit comments

Comments (0)