Skip to content

Commit 29ddffd

Browse files
authored
Allow HF and sentence-transformer models (#63)
* Fix Bug to allow both HF and sentence-transformer models * Add tests
1 parent 0ecc5d3 commit 29ddffd

File tree

5 files changed

+81
-4
lines changed

5 files changed

+81
-4
lines changed

crossfit/backend/torch/hf/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
from crossfit.backend.torch.model import Model
2727
from crossfit.dataset.home import CF_HOME
28+
from crossfit.utils.model_adapter import adapt_model_input
2829

2930

3031
class HFModel(Model):
@@ -96,7 +97,7 @@ def fit_memory_estimate_curve(self, model=None):
9697
}
9798

9899
try:
99-
_ = model(**batch)
100+
_ = adapt_model_input(model, batch)
100101
memory_used = torch.cuda.max_memory_allocated() / (1024**2) # Convert to MB
101102
X.append([batch_size, seq_len, seq_len**2])
102103
y.append(memory_used)

crossfit/backend/torch/loader.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from crossfit.data.array.conversion import convert_array
2323
from crossfit.data.array.dispatch import crossarray
2424
from crossfit.data.dataframe.dispatch import CrossFrame
25+
from crossfit.utils.model_adapter import adapt_model_input
2526

2627
DEFAULT_BATCH_SIZE = 512
2728

@@ -70,7 +71,7 @@ def __next__(self):
7071
self.current_idx += self.batch_size
7172

7273
for fn in self._to_map:
73-
batch = fn(batch)
74+
batch = adapt_model_input(fn, batch)
7475

7576
if self.progress_bar is not None:
7677
self.progress_bar.update(batch_size)
@@ -141,7 +142,7 @@ def __next__(self):
141142
batch = {key: val[:, :clip_len] for key, val in batch.items()}
142143

143144
for fn in self._to_map:
144-
batch = fn(batch)
145+
batch = adapt_model_input(fn, batch)
145146

146147
break
147148
except torch.cuda.OutOfMemoryError:

crossfit/backend/torch/op/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def call(self, data, partition_info=None):
7474
for output in loader.map(self.model.get_model(self.get_worker())):
7575
if isinstance(output, dict):
7676
if self.model_output_col not in output:
77-
raise ValueError(f"Column '{self.model_outupt_col}' not found in model output.")
77+
raise ValueError(f"Column '{self.model_output_col}' not found in model output.")
7878
output = output[self.model_output_col]
7979

8080
if self.post is not None:

crossfit/utils/model_adapter.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from typing import Any, Callable
2+
3+
4+
def adapt_model_input(model: Callable, encoded_input: dict) -> Any:
    """Invoke *model* on *encoded_input*, whichever calling convention it accepts.

    Hugging Face models take the tokenizer output expanded as keyword
    arguments, while Sentence Transformers models take the whole encoded
    dict as one positional argument.  The keyword form is attempted first;
    a ``TypeError`` (signature mismatch) triggers the positional fallback.

    Args:
        model: The model callable to apply.
        encoded_input: Encoded/tokenized input to feed to the model.

    Returns:
        Whatever the model returns.
    """
    try:
        # Standard Hugging Face convention: model(**inputs).
        return model(**encoded_input)
    except TypeError:
        # Fallback for models that want a single dict argument,
        # e.g. Sentence Transformers: model(inputs).
        return model(encoded_input)

tests/test_utils.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright 2024 NVIDIA CORPORATION
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
from crossfit.utils.model_adapter import adapt_model_input
18+
19+
torch = pytest.importorskip("torch")
20+
sentence_transformers = pytest.importorskip("sentence_transformers")
21+
transformers = pytest.importorskip("transformers")
22+
23+
24+
def test_adapt_model_input_hf():
    """adapt_model_input must match a direct kwargs call for HF models."""
    from transformers import AutoTokenizer, DistilBertModel

    with torch.no_grad():
        # Load a small HF encoder and its matching tokenizer.
        model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        encoded = tok("Hello, my dog is cute", return_tensors="pt")

        # Reference: the standard Hugging Face kwargs calling convention.
        expected = model(**encoded)
        actual = adapt_model_input(model, encoded)
        assert torch.equal(actual.last_hidden_state, expected.last_hidden_state)
36+
37+
38+
def test_adapt_model_input_sentence_transformers():
    """adapt_model_input must match a direct single-dict call for ST models."""
    from transformers import AutoTokenizer

    with torch.no_grad():
        # Sentence Transformers models expect the encoded dict as one argument.
        model = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2").to("cpu")
        tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

        encoded = tok(
            ["Hello", "my dog is cute"], return_tensors="pt", padding=True, truncation=True
        )
        # Reference: direct positional call, as SentenceTransformer defines it.
        expected = model(encoded)
        actual = adapt_model_input(model, encoded)

        assert torch.equal(actual.sentence_embedding, expected.sentence_embedding)

0 commit comments

Comments (0)