
Commit f9f3dcb

fix: gemma-3 checkpoint conversion from litgpt to hf (#2195)

Co-authored-by: Bhimraj Yadav <bhimrajyadav977@gmail.com>
1 parent: dbddd23

File tree

2 files changed: +6 −4 lines

litgpt/scripts/convert_lit_checkpoint.py

Lines changed: 4 additions & 2 deletions

@@ -170,7 +170,7 @@ def copy_weights_gemma_2(
     config: Config,
     state_dict: Dict[str, torch.Tensor],
     lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
-    untie_weights: bool = False,
+    untie_weights: bool = True,
     saver: Optional[incremental_save] = None,
 ) -> None:
     weight_map = {
@@ -219,7 +219,7 @@ def copy_weights_gemma_3(
     config: Config,
     state_dict: Dict[str, torch.Tensor],
     lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
-    untie_weights: bool = False,
+    untie_weights: bool = True,
     saver: Optional[incremental_save] = None,
 ) -> None:
     weight_map = {
@@ -557,6 +557,8 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None:
         copy_fn = partial(copy_weights_falcon, config)
     elif config.name.startswith("Gemma-2"):
         copy_fn = partial(copy_weights_gemma_2, config)
+    elif config.name.startswith("Gemma-3"):
+        copy_fn = partial(copy_weights_gemma_3, config)
     elif config.name.lower().startswith("phi"):
         copy_fn = partial(copy_weights_phi, config)
     elif config.name.lower().startswith(("qwen2.5", "qwq")):
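
Why the default flip: Gemma ties the output head to the token embedding, so the exported Hugging Face state dict should not carry a separate copy of lm_head.weight; the HF model re-ties the head to the embedding when it loads. Changing the default to untie_weights=True makes the bare copy_weights_gemma_* call used by convert_lit_checkpoint do this automatically. A minimal sketch of the tie-skipping pattern, not the exact litgpt implementation (copy_weights_tied and the name map are illustrative):

from typing import Dict

import torch

# Illustrative litgpt -> HF name map, reduced to the two weights that matter here.
LIT_TO_HF = {
    "transformer.wte.weight": "model.embed_tokens.weight",
    "lm_head.weight": "lm_head.weight",
}


def copy_weights_tied(
    state_dict: Dict[str, torch.Tensor],
    lit_weights: Dict[str, torch.Tensor],
    untie_weights: bool = True,
) -> None:
    # Copy litgpt tensors into an HF-style state dict (sketch only).
    for name, tensor in lit_weights.items():
        if untie_weights and name == "lm_head.weight":
            # The head is tied to transformer.wte.weight; drop the duplicate
            # so the HF model re-ties the two when loading the checkpoint.
            continue
        state_dict[LIT_TO_HF[name]] = tensor


# With the new default, a bare call behaves like the old untie_weights=True:
sd: Dict[str, torch.Tensor] = {}
copy_weights_tied(
    sd,
    {"transformer.wte.weight": torch.zeros(2, 2), "lm_head.weight": torch.zeros(2, 2)},
)
assert "lm_head.weight" not in sd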

tests/convert/test_lit_checkpoint.py

Lines changed: 2 additions & 2 deletions

@@ -501,7 +501,7 @@ def test_against_original_gemma_2(model_name, device, dtype):
     ours_model.lm_head.weight = ours_model.transformer.wte.weight
     ours_state_dict = ours_model.state_dict()
     theirs_state_dict = {}
-    copy_weights_gemma_2(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True)
+    copy_weights_gemma_2(ours_config, theirs_state_dict, ours_state_dict)
     theirs_model = Gemma2ForCausalLM(theirs_config).to(device)
     keys = theirs_model.load_state_dict(theirs_state_dict, strict=False)
     assert not keys.unexpected_keys
@@ -574,7 +574,7 @@ def test_against_original_gemma_3(model_name, device, dtype):
     ours_model.lm_head.weight = ours_model.transformer.wte.weight
     ours_state_dict = ours_model.state_dict()
     theirs_state_dict = {}
-    copy_weights_gemma_3(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True)
+    copy_weights_gemma_3(ours_config, theirs_state_dict, ours_state_dict)
     theirs_model = Gemma3ForCausalLM(theirs_config).to(device)
     keys = theirs_model.load_state_dict(theirs_state_dict, strict=False)
     assert not keys.unexpected_keys
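
The test updates drop the explicit untie_weights=True argument, so both tests now exercise the new default. End to end, the added Gemma-3 branch means a litgpt Gemma-3 checkpoint converts via the name-based dispatch. A hedged usage sketch: the function and its signature come from the hunk above, while the checkpoint paths are placeholders:

from pathlib import Path

from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint

# Placeholder paths: a litgpt Gemma-3 checkpoint and the output directory.
checkpoint_dir = Path("checkpoints/google/gemma-3-1b-it")
output_dir = Path("converted/gemma-3-1b-it")

# config.name starts with "Gemma-3", so the new branch routes the copy to
# copy_weights_gemma_3 and writes the HF-layout weights to output_dir.
convert_lit_checkpoint(checkpoint_dir, output_dir)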
