Commit 5f27227

fix: Biencoder consolidated checkpoint and transformers issue (#936)

1 parent 99d214e
File tree

6 files changed: +374 -70 lines changed

6 files changed

+374
-70
lines changed

nemo_automodel/components/checkpoint/addons.py

Lines changed: 5 additions & 2 deletions
@@ -253,9 +253,12 @@ def _maybe_save_custom_model_code(original_model_path: str | None, hf_metadata_d
     """
     if original_model_path is None:
         return
-    if not os.path.isdir(original_model_path):
+    if os.path.isfile(original_model_path):
+        pattern = original_model_path
+    elif os.path.isdir(original_model_path):
+        pattern = os.path.join(original_model_path, "**", "*.py")
+    else:
         return
-    pattern = os.path.join(original_model_path, "**", "*.py")
     for src_path in glob.glob(pattern, recursive=True):
         # Skip any .hidden paths
         rel_path = os.path.relpath(src_path, original_model_path)
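For context, the addons.py change above lets the custom-code copy step accept either a single Python file or a directory of model code. A minimal standalone sketch of the new pattern selection, using a hypothetical path that is not part of the commit:

import glob
import os

def resolve_custom_code_pattern(path: str):
    # Hypothetical helper mirroring the new logic in _maybe_save_custom_model_code:
    # a single .py file is copied as-is, a directory is searched recursively for *.py.
    if os.path.isfile(path):
        return path
    elif os.path.isdir(path):
        return os.path.join(path, "**", "*.py")
    return None

pattern = resolve_custom_code_pattern("models/llama_bidirectional_model.py")  # hypothetical path
if pattern is not None:
    for src_path in glob.glob(pattern, recursive=True):
        print(src_path)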

nemo_automodel/components/models/biencoder/llama_bidirectional_model.py

Lines changed: 27 additions & 52 deletions
@@ -42,7 +42,14 @@
     LlamaForSequenceClassification,
     LlamaModel,
 )
-from transformers.utils import auto_docstring, can_return_tuple, logging
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, logging
+from transformers.utils.generic import check_model_inputs
+
+try:
+    from nemo_automodel.components.models.biencoder.state_dict_adapter import BiencoderStateDictAdapter
+except ImportError:
+    BiencoderStateDictAdapter = object
 
 logger = logging.get_logger(__name__)
 
@@ -170,7 +177,7 @@ def _update_causal_mask(
             return attention_mask
         return None
 
-    @can_return_tuple
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,
@@ -179,40 +186,22 @@ def forward(
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Cache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **flash_attn_kwargs,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
-        if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
-            use_cache = False
-
-        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
-        if not isinstance(past_key_values, (type(None), Cache)):
-            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
-
         if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
 
         if use_cache and past_key_values is None:
-            past_key_values = DynamicCache()
+            past_key_values = DynamicCache(config=self.config)
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(
+            cache_position: torch.Tensor = torch.arange(
                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
             )
 
@@ -222,46 +211,23 @@ def forward(
         causal_mask = self._update_causal_mask(attention_mask=attention_mask)
 
         hidden_states = inputs_embeds
-
-        # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            layer_outputs = decoder_layer(
+            hidden_states = decoder_layer(
                 hidden_states,
                 attention_mask=causal_mask,
                 position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
+                past_key_values=past_key_values,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
-                **flash_attn_kwargs,
+                **kwargs,
             )
 
-            hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
         hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
-            past_key_values=past_key_values if use_cache else None,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
+            past_key_values=past_key_values,
         )
 
 
@@ -432,6 +398,15 @@ def __init__(
         self.config = self.lm_q.config
         self.trainer = None
 
+        # For HuggingFace consolidated checkpoint compatibility
+        self.name_or_path = os.path.abspath(__file__)
+        self.state_dict_adapter = BiencoderStateDictAdapter()
+        self.config.architectures = ["LlamaBidirectionalModel"]
+        self.config.auto_map = {
+            "AutoModel": "llama_bidirectional_model.LlamaBidirectionalModel",
+            "AutoConfig": "llama_bidirectional_model.LlamaBidirectionalConfig",
+        }
+
     def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = None):
        """Forward pass for training."""
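With config.architectures and config.auto_map set in __init__, and the model source copied next to the consolidated checkpoint by the addons.py change above, the exported query encoder should be loadable through the standard Hugging Face auto classes. A minimal sketch, assuming the consolidated checkpoint was written to a local directory named consolidated/ (hypothetical path):

from transformers import AutoModel

ckpt_dir = "consolidated/"  # hypothetical location of the consolidated checkpoint
# trust_remote_code lets AutoModel resolve config.auto_map to the copied
# llama_bidirectional_model.py and instantiate the custom class.
model = AutoModel.from_pretrained(ckpt_dir, trust_remote_code=True)
print(type(model).__name__)  # expected: LlamaBidirectionalModel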

nemo_automodel/components/models/biencoder/state_dict_adapter.py

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+from torch.distributed.device_mesh import DeviceMesh
+
+from nemo_automodel.components.checkpoint.state_dict_adapter import StateDictAdapter
+
+
+class BiencoderStateDictAdapter(StateDictAdapter):
+    """Adapter for converting BiencoderModel state dict to single encoder format.
+
+    This adapter extracts only the query encoder (lm_q) state dict and converts
+    the "lm_q." prefix to "model." prefix, making it compatible with standard
+    HuggingFace model format.
+    """
+
+    def __init__(self):
+        """Initialize the adapter."""
+        self._uses_model_prefix = True
+
+    def to_hf(self, state_dict: dict[str, Any], **kwargs) -> dict[str, Any]:
+        """Convert from biencoder state dict to HuggingFace format.
+
+        Filters to only lm_q keys and converts "lm_q." prefix to "model." prefix.
+
+        Args:
+            state_dict: The biencoder model state dict
+
+        Returns:
+            The converted HuggingFace format state dict with only query encoder
+        """
+        hf_state_dict = {}
+
+        for key, value in state_dict.items():
+            if key.startswith("lm_q."):
+                new_key = key.replace("lm_q.", "model.")
+                hf_state_dict[new_key] = value
+
+        return hf_state_dict
+
+    def from_hf(
+        self,
+        hf_state_dict: dict[str, Any],
+        device_mesh: Optional["DeviceMesh"] = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """Convert HuggingFace state dict to biencoder format.
+
+        Converts "model." prefix to "lm_q." prefix for loading into biencoder.
+
+        Args:
+            hf_state_dict: The HuggingFace format state dict
+            device_mesh: Optional device mesh (not used in this adapter)
+
+        Returns:
+            The converted biencoder format state dict
+        """
+        biencoder_state_dict = {}
+
+        for key, value in hf_state_dict.items():
+            if key.startswith("model."):
+                new_key_q = key.replace("model.", "lm_q.")
+                biencoder_state_dict[new_key_q] = value
+                new_key_p = key.replace("model.", "lm_p.")
+                biencoder_state_dict[new_key_p] = value
+
+        return biencoder_state_dict
+
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from biencoder format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in biencoder format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments (unused)
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format.
+            Returns empty list if tensor is not part of lm_q.
+        """
+        if fqn.startswith("lm_q."):
+            new_fqn = fqn.replace("lm_q.", "model.")
+            return [(new_fqn, tensor)]
+
+        # Skip tensors that are not part of lm_q
+        return []
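As a quick illustration of the key mapping in the new adapter, a round trip over a toy state dict (tensor names and shapes are made up for the example):

import torch

from nemo_automodel.components.models.biencoder.state_dict_adapter import BiencoderStateDictAdapter

adapter = BiencoderStateDictAdapter()

biencoder_sd = {
    "lm_q.embed_tokens.weight": torch.zeros(2, 4),  # query encoder weight (kept on export)
    "lm_p.embed_tokens.weight": torch.zeros(2, 4),  # passage encoder weight (dropped on export)
}

hf_sd = adapter.to_hf(biencoder_sd)
print(sorted(hf_sd))      # ['model.embed_tokens.weight']

restored = adapter.from_hf(hf_sd)
print(sorted(restored))   # ['lm_p.embed_tokens.weight', 'lm_q.embed_tokens.weight']

Note that from_hf fans each "model." weight out to both lm_q and lm_p, which lets both encoders be initialized from a single exported encoder checkpoint.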

nemo_automodel/recipes/biencoder/train_biencoder.py

Lines changed: 14 additions & 5 deletions
@@ -433,6 +433,8 @@ def setup(self):
         self.model_parts = [model]
         self.pp = None
 
+        self.checkpointer.config.model_state_dict_keys = ["model." + k for k in model.lm_q.state_dict().keys()]
+
         # Build optimizer
         logger.info("Building optimizer...")
         trainable_params = list(filter(lambda x: x.requires_grad, self.model_parts[0].parameters()))
@@ -518,17 +520,24 @@ def run_train_validation_loop(self):
                 # Log metrics
                 self.log_train_metrics(train_log_data)
 
-                # Save checkpoint every ckpt_every_steps
-                if self.step_scheduler.is_ckpt_step:
-                    self.save_checkpoint(epoch, self.step_scheduler.step)
-
                 # Run validation every val_every_steps
+                val_loss = None
                 if self.step_scheduler.is_val_step and self.val_dataloader is not None:
                     val_log_data = self._run_validation_epoch(self.val_dataloader)
                     self.log_val_metrics(val_log_data)
+                    val_loss = {"val_loss": val_log_data.metrics["val_loss"]}
                     for mp in self.model_parts:
                         mp.train()
 
+                # Save checkpoint every ckpt_every_steps
+                if self.step_scheduler.is_ckpt_step:
+                    self.save_checkpoint(
+                        epoch,
+                        self.step_scheduler.step,
+                        train_loss=train_log_data.metrics["loss"],
+                        val_loss=val_loss,
+                    )
+
         # Close JSONL loggers after training loop completes
         self.metric_logger_train.close()
         self.metric_logger_valid.close()
@@ -611,7 +620,7 @@ def _run_train_optim_step(self, batches, max_grad_norm=None):
             scheduler.step(1)
 
         # Compute average loss across gradient accumulation and DP ranks
-        reporting_loss = torch.sum(torch.stack(loss_buffer))
+        reporting_loss = torch.mean(torch.stack(loss_buffer))
         if torch.distributed.is_initialized():
             reporting_loss = self._dp_allreduce(reporting_loss, include_cp=True)
             # Divide by DP group size to get average across all ranks
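The last hunk switches the reported loss from a sum to a mean over the per-micro-batch losses collected during gradient accumulation, so the logged value no longer scales with the number of accumulation steps. A small illustration with made-up values:

import torch

# Per-micro-batch losses gathered across one optimizer step (made-up values).
loss_buffer = [torch.tensor(2.0), torch.tensor(4.0), torch.tensor(6.0)]

print(torch.sum(torch.stack(loss_buffer)))   # tensor(12.) -- grows with accumulation steps
print(torch.mean(torch.stack(loss_buffer)))  # tensor(4.)  -- comparable across settings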
