Skip to content

Commit d4f1e3c

Browse files
committed
Update on "[2/N] Support lora checkpoint on partial save and multi-source load"
### Summary

- **`merge_adapter` config**: Replace `save_format: str` with `merge_adapter: bool` on `LoRAConverter.Config`. When `True`, adapters are folded into base weights (`base + alpha/rank * B @ A`) at end of training. When `False` (default), adapter weights are saved separately — use `checkpoint.last_save_in_hf=True` to save in HuggingFace PEFT format.
- **`finalize()` lifecycle on ModelConverter protocol**: Add an end-of-training hook called before the last checkpoint save. `ModelConvertersContainer` runs finalize in reverse converter order (LoRA merge before quantization CONVERT). A `converter_finalize_fn` closure is attached to each model part during `convert()` so the checkpoint system can invoke it. All existing converters (Float8Linear, Float8GroupedMM, MXFP8) get no-op `finalize()` implementations.
- **Checkpoint integration**: `ModelWrapper` gains converter-aware methods (`state_dict_to_save`, `export_state_dict`, `base_state_dict`, `has_converter_keys`, `converter_save_last_fn`, `converter_load_additional_fn`) to support adapter-only checkpointing. `dcp_load` accepts `checkpoint_ids: list[str]` for loading from multiple sources (base model + adapter weights). A new `additional_load_paths` config field enables multi-source loading. The PEFT save path in `_save_last_step` is gated on `last_save_in_hf`.
- **LoRA PEFT save/load**: `_make_peft_save_fn()` writes `adapter_model.safetensors` + `adapter_config.json` with HF PEFT key naming. `_make_peft_load_fn()` loads and remaps keys back. `remap_lora_keys_to_hf()` / `remap_lora_keys_from_hf()` handle the bidirectional key translation.
- **Configs**: Add `llama3_8b_lora` (rank=128, alpha=32, `last_save_in_hf=True`). Update `llama3_debugmodel_lora` with checkpoint settings for proper resumption.
### Test plan

- [ ] `pytest tests/unit_tests/test_checkpoint.py -x` — new `TestModelWrapperConverterKeys` tests (strict vs partial planner)
- [ ] `pytest tests/unit_tests/test_model_converter.py -x` — new `test_lora_key_remap_roundtrip`
- [ ] `torchrun --nproc_per_node=4 -m torchtitan.train --module llama3 --config llama3_debugmodel_lora` — verify LoRA training runs end-to-end
- [ ] `torchrun --nproc_per_node=4 -m torchtitan.train --module llama3 --config llama3_8b_lora` — verify 8B LoRA config with PEFT save

* #2484

[ghstack-poisoned]
2 parents eae6f72 + ba921e5 commit d4f1e3c

File tree

3 files changed

+21
-15
lines changed

3 files changed

+21
-15
lines changed

torchtitan/components/checkpoint.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,12 @@ def load_state_dict(state_dict):
470470
sd_adapter is not None
471471
), "checkpoint.last_save_in_hf is True, but sd_adapter is not provided."
472472
self.sd_adapter = sd_adapter
473+
474+
# Inject from_hf_map into converter hooks so save/load fns can remap keys
475+
hooks = self.states[MODEL]._get_hooks()
476+
if hooks is not None and sd_adapter is not None:
477+
hooks.from_hf_map = getattr(sd_adapter, "from_hf_map", None)
478+
473479
self.export_dtype = TORCH_DTYPE_MAP[config.export_dtype]
474480
self.exclude_from_loading = config.exclude_from_loading
475481
self.additional_load_paths = config.additional_load_paths
@@ -684,7 +690,7 @@ def dcp_load(
684690
load_fn(
685691
cid,
686692
self.states[MODEL].model,
687-
self._get_from_hf_map(),
693+
hooks,
688694
)
689695
else:
690696
# DCP: load all available states (model + training info).
@@ -1042,7 +1048,7 @@ def _save_last_step(self, curr_step: int) -> None:
10421048
and save_last_fn is not None
10431049
):
10441050
checkpoint_dir = self._create_checkpoint_id(curr_step)
1045-
save_last_fn(states, checkpoint_dir, self._get_from_hf_map())
1051+
save_last_fn(states, checkpoint_dir, hooks)
10461052
return
10471053

10481054
self.dcp_save(
@@ -1053,12 +1059,6 @@ def _save_last_step(self, curr_step: int) -> None:
10531059
to_hf=self.last_save_in_hf,
10541060
)
10551061

1056-
def _get_from_hf_map(self) -> dict[str, str | None] | None:
1057-
"""Return from_hf_map from sd_adapter, or None if unavailable."""
1058-
if self.sd_adapter is None:
1059-
return None
1060-
return getattr(self.sd_adapter, "from_hf_map", None)
1061-
10621062
def _should_save(self, curr_step: int, last_step: bool = False) -> bool:
10631063
if not self.enable or self.load_only:
10641064
return False

torchtitan/components/lora.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def _save_peft(
119119
self,
120120
state_dict: dict[str, Any],
121121
checkpoint_dir: str,
122-
from_hf_map: dict[str, str | None] | None,
122+
hooks: "ConverterCheckpointHooks",
123123
) -> None:
124124
"""Save adapter weights in PEFT format.
125125
@@ -138,8 +138,8 @@ def _save_peft(
138138
cpu_states[k] = v.cpu() if isinstance(v, torch.Tensor) else v
139139

140140
# Remap keys to HF PEFT naming
141-
if from_hf_map is not None:
142-
hf_states = remap_lora_keys_to_hf(cpu_states, from_hf_map)
141+
if hooks.from_hf_map is not None:
142+
hf_states = remap_lora_keys_to_hf(cpu_states, hooks.from_hf_map)
143143
else:
144144
logger.warning(
145145
"No from_hf_map available; saving PEFT with torchtitan keys."
@@ -181,7 +181,7 @@ def _load_peft(
181181
self,
182182
path: str,
183183
model_parts: list[nn.Module],
184-
from_hf_map: dict[str, str | None] | None,
184+
hooks: "ConverterCheckpointHooks",
185185
) -> None:
186186
"""Load adapter weights from a PEFT directory.
187187
@@ -194,8 +194,8 @@ def _load_peft(
194194

195195
safetensors_path = os.path.join(path, "adapter_model.safetensors")
196196
adapter_sd = load_file(safetensors_path)
197-
if from_hf_map is not None:
198-
adapter_sd = remap_lora_keys_from_hf(adapter_sd, from_hf_map)
197+
if hooks.from_hf_map is not None:
198+
adapter_sd = remap_lora_keys_from_hf(adapter_sd, hooks.from_hf_map)
199199
func = functools.partial(
200200
set_model_state_dict,
201201
model_state_dict=adapter_sd,
@@ -243,8 +243,13 @@ def finalize(self, model: nn.Module) -> None:
243243
for name, mod in list(model.named_modules()):
244244
if not (hasattr(mod, "lora_a") and hasattr(mod, "lora_b")):
245245
continue
246+
assert isinstance(mod, nn.Linear)
247+
lora_a = mod.lora_a
248+
lora_b = mod.lora_b
249+
assert isinstance(lora_a, nn.Linear)
250+
assert isinstance(lora_b, nn.Linear)
246251
with torch.no_grad():
247-
mod.weight.add_(scaling * (mod.lora_b.weight @ mod.lora_a.weight))
252+
mod.weight.add_(scaling * (lora_b.weight @ lora_a.weight))
248253
del mod.lora_a, mod.lora_b
249254
if hasattr(mod, "_lora_scaling"):
250255
del mod._lora_scaling

torchtitan/protocols/model_converter.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class ConverterCheckpointHooks:
2323
save_last_fn: Callable | None = None
2424
load_additional_fn: Callable | None = None
2525
finalize_fn: Callable | None = None
26+
from_hf_map: dict[str, str | None] | None = None
2627

2728

2829
class ModelConverter(Protocol):

0 commit comments

Comments
 (0)