
Commit 4d319d4

fix: add pooler weights to biencoder state dict adapter (#998)
Signed-off-by: HuiyingLi <willwin.lee@gmail.com>
1 parent 272c9d7 commit 4d319d4

File tree

2 files changed: 48 additions, 0 deletions

nemo_automodel/components/models/biencoder/state_dict_adapter.py

Lines changed: 6 additions & 0 deletions
@@ -48,6 +48,8 @@ def to_hf(self, state_dict: dict[str, Any], **kwargs) -> dict[str, Any]:
             if key.startswith("lm_q."):
                 new_key = key.replace("lm_q.", "model.")
                 hf_state_dict[new_key] = value
+            elif key.startswith("linear_pooler."):
+                hf_state_dict[key] = value

         return hf_state_dict

@@ -76,6 +78,8 @@ def from_hf(
                 biencoder_state_dict[new_key_q] = value
                 new_key_p = key.replace("model.", "lm_p.")
                 biencoder_state_dict[new_key_p] = value
+            elif key.startswith("linear_pooler."):
+                biencoder_state_dict[key] = value

         return biencoder_state_dict

@@ -94,6 +98,8 @@ def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[t
         if fqn.startswith("lm_q."):
             new_fqn = fqn.replace("lm_q.", "model.")
             return [(new_fqn, tensor)]
+        if fqn.startswith("linear_pooler."):
+            return [(fqn, tensor)]

         # Skip tensors that are not part of lm_q
         return []
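
To make the effect of these hunks concrete, here is a small, self-contained sketch that restates the to_hf / from_hf key mapping shown above and round-trips a toy state dict through it. The standalone function names are illustrative rather than code from the repo, and the model.-prefix condition in from_hf_sketch is an assumption (the diff only shows the assignment lines inside that branch); in the repo this logic lives on the adapter class in state_dict_adapter.py.

import torch
from typing import Any


def to_hf_sketch(state_dict: dict[str, Any]) -> dict[str, Any]:
    """Mirror of the to_hf hunk: remap lm_q.* to model.* and (after this fix) keep linear_pooler.*."""
    hf_state_dict: dict[str, Any] = {}
    for key, value in state_dict.items():
        if key.startswith("lm_q."):
            hf_state_dict[key.replace("lm_q.", "model.")] = value
        elif key.startswith("linear_pooler."):
            hf_state_dict[key] = value  # new pass-through branch from this commit
    return hf_state_dict


def from_hf_sketch(hf_state_dict: dict[str, Any]) -> dict[str, Any]:
    """Mirror of the from_hf hunk: fan model.* out to lm_q.*/lm_p.* and keep linear_pooler.*."""
    biencoder_state_dict: dict[str, Any] = {}
    for key, value in hf_state_dict.items():
        if key.startswith("model."):  # assumed condition; not visible in the hunk
            biencoder_state_dict[key.replace("model.", "lm_q.")] = value
            biencoder_state_dict[key.replace("model.", "lm_p.")] = value
        elif key.startswith("linear_pooler."):
            biencoder_state_dict[key] = value  # new pass-through branch from this commit
    return biencoder_state_dict


# Round trip on toy tensors: the pooler weights now survive both directions.
sd = {"lm_q.layer.weight": torch.randn(2, 2), "linear_pooler.bias": torch.randn(4)}
hf = to_hf_sketch(sd)
assert "model.layer.weight" in hf and "linear_pooler.bias" in hf
back = from_hf_sketch(hf)
assert "lm_q.layer.weight" in back and "lm_p.layer.weight" in back and "linear_pooler.bias" in back

Keys that match neither branch are simply dropped by this sketch; whether the real adapter handles additional prefixes is not visible in this diff.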

tests/unit_tests/models/biencoder/test_state_dict_adapter.py

Lines changed: 42 additions & 0 deletions
@@ -81,6 +81,22 @@ def test_to_hf_only_lm_q_keys(self, adapter):
         assert "model.layer1.weight" in hf_state_dict
         assert "model.layer1.bias" in hf_state_dict

+    def test_to_hf_includes_linear_pooler(self, adapter):
+        """Pooler weights should be retained during HF conversion."""
+        biencoder_state_dict = {
+            "lm_q.layer.weight": torch.randn(2, 2),
+            "linear_pooler.weight": torch.randn(4, 4),
+            "linear_pooler.bias": torch.randn(4),
+        }
+
+        hf_state_dict = adapter.to_hf(biencoder_state_dict)
+
+        assert "model.layer.weight" in hf_state_dict  # sanity for lm_q path
+        assert "linear_pooler.weight" in hf_state_dict
+        assert "linear_pooler.bias" in hf_state_dict
+        assert torch.equal(hf_state_dict["linear_pooler.weight"], biencoder_state_dict["linear_pooler.weight"])
+        assert torch.equal(hf_state_dict["linear_pooler.bias"], biencoder_state_dict["linear_pooler.bias"])
+
     def test_from_hf_basic(self, adapter):
         """Test basic conversion from HuggingFace to biencoder format."""
         hf_state_dict = {

@@ -103,6 +119,23 @@ def test_from_hf_basic(self, adapter):
         assert torch.equal(biencoder_state_dict["lm_q.layer2.bias"], hf_state_dict["model.layer2.bias"])
         assert torch.equal(biencoder_state_dict["lm_p.layer2.bias"], hf_state_dict["model.layer2.bias"])

+    def test_from_hf_includes_linear_pooler(self, adapter):
+        """Pooler weights should be retained when converting from HF."""
+        hf_state_dict = {
+            "model.layer.weight": torch.randn(2, 2),
+            "linear_pooler.weight": torch.randn(4, 4),
+            "linear_pooler.bias": torch.randn(4),
+        }
+
+        biencoder_state_dict = adapter.from_hf(hf_state_dict)
+
+        assert "lm_q.layer.weight" in biencoder_state_dict
+        assert "lm_p.layer.weight" in biencoder_state_dict
+        assert "linear_pooler.weight" in biencoder_state_dict
+        assert "linear_pooler.bias" in biencoder_state_dict
+        assert torch.equal(biencoder_state_dict["linear_pooler.weight"], hf_state_dict["linear_pooler.weight"])
+        assert torch.equal(biencoder_state_dict["linear_pooler.bias"], hf_state_dict["linear_pooler.bias"])
+
     def test_from_hf_empty_state_dict(self, adapter):
         """Test conversion with empty state dict."""
         biencoder_state_dict = adapter.from_hf({})

@@ -153,6 +186,15 @@ def test_convert_single_tensor_to_hf_other(self, adapter):

         assert result == []

+    def test_convert_single_tensor_to_hf_linear_pooler(self, adapter):
+        """Test converting linear_pooler tensor (should be passed through)."""
+        tensor = torch.randn(4)
+        result = adapter.convert_single_tensor_to_hf("linear_pooler.bias", tensor)
+
+        assert len(result) == 1
+        assert result[0][0] == "linear_pooler.bias"
+        assert torch.equal(result[0][1], tensor)
+
     def test_convert_single_tensor_to_hf_with_kwargs(self, adapter):
         """Test that convert_single_tensor_to_hf accepts kwargs."""
         tensor = torch.randn(10, 10)
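
To exercise just the new cases locally, one option (assuming pytest and torch are installed in the environment) is to filter on the shared linear_pooler substring in the names of the tests added above:

import pytest

# Runs the three tests added in this commit; all of their names contain "linear_pooler".
pytest.main(["tests/unit_tests/models/biencoder/test_state_dict_adapter.py", "-k", "linear_pooler"])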

0 commit comments
