Commit 8993985

Improve text embedding generation (#1064)
1 parent 1f91290 · commit 8993985

7 files changed: +59 −81 lines

nemo_curator/stages/text/classifiers/aegis.py

Lines changed: 2 additions & 13 deletions

@@ -85,7 +85,6 @@ def __init__( # noqa: PLR0913
         local_files_only: bool = True,
         hf_token: str | bool | None = None,
         add_instruction_data_guard: bool = False,
-        autocast: bool = False,
     ):
         super().__init__()

@@ -107,7 +106,6 @@ def __init__( # noqa: PLR0913
             cache_dir=cache_dir,
             local_files_only=local_files_only,
         )
-        self.autocast = autocast
         self.add_instruction_data_guard = add_instruction_data_guard
         if self.add_instruction_data_guard:
             self.instruction_data_guard_net = InstructionDataGuardNet(4096)
@@ -117,7 +115,7 @@ def device(self) -> torch.device:
         return next(self.parameters()).device

     @torch.no_grad()
-    def _forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
+    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
         batch = {k: v.to(TORCH_DTYPE) if v.dtype.is_floating_point else v for k, v in batch.items()}

         if self.add_instruction_data_guard:
@@ -145,14 +143,6 @@ def _forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:

         return response

-    @torch.no_grad()
-    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
-        if self.autocast:
-            with torch.autocast(device_type="cuda"):
-                return self._forward(batch)
-        else:
-            return self._forward(batch)
-

 class AegisModelStage(ModelStage):
     """
@@ -179,12 +169,12 @@ def __init__( # noqa: PLR0913
             has_seq_order=has_seq_order,
             padding_side=TOKENIZER_PADDING_SIDE,
             unpack_inference_batch=False,
+            autocast=autocast,
         )

         self.add_instruction_data_guard = add_instruction_data_guard
         self.pred_column = pred_column
         self.prob_column = prob_column
-        self.autocast = autocast

     def outputs(self) -> tuple[list[str], list[str]]:
         return ["data"], [self.pred_column] + ([self.prob_column] if self.add_instruction_data_guard else [])
@@ -199,7 +189,6 @@ def _setup(self, local_files_only: bool = True) -> None:
             local_files_only=local_files_only,
             hf_token=self.hf_token,
             add_instruction_data_guard=self.add_instruction_data_guard,
-            autocast=self.autocast,
         )
         if self.add_instruction_data_guard:
             self.model.instruction_data_guard_net = self.model.instruction_data_guard_net.from_pretrained(
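
The deletions above all follow one pattern: `AegisModel` loses its `autocast` flag and its wrapper `forward`, `_forward` is promoted to be the real `forward`, and the stage forwards `autocast=autocast` to the shared `ModelStage` base instead. A minimal sketch (toy linear model, not the Aegis weights) of what stage-level wrapping does:

```python
import torch

# Hypothetical stand-in for a classifier; under autocast the CUDA matmul
# runs in reduced precision without the model branching on a flag itself.
model = torch.nn.Linear(8, 2).cuda().eval()
batch = torch.randn(4, 8, device="cuda")

with torch.no_grad(), torch.autocast(device_type="cuda"):
    out = model(batch)

print(out.dtype)  # typically torch.float16 under CUDA autocast
```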

nemo_curator/stages/text/classifiers/base.py

Lines changed: 10 additions & 16 deletions

@@ -53,7 +53,7 @@ def device(self) -> torch.device:
         return next(self.parameters()).device

     @torch.no_grad()
-    def _forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
+    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
         features = self.model(batch[INPUT_ID_COLUMN], batch[ATTENTION_MASK_COLUMN]).last_hidden_state
         dropped = self.dropout(features)
         outputs = self.fc(dropped)
@@ -62,17 +62,6 @@ def _forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:

         return torch.softmax(outputs[:, 0, :], dim=1)

-    @torch.no_grad()
-    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
-        if self.autocast:
-            with torch.autocast(device_type="cuda"):
-                return self._forward(batch)
-        else:
-            return self._forward(batch)
-
-    def set_autocast(self, autocast: bool) -> None:
-        self.autocast = autocast
-

 class ClassifierModelStage(ModelStage):
     """
@@ -109,6 +98,7 @@ def __init__( # noqa: PLR0913
             model_inference_batch_size=model_inference_batch_size,
             padding_side=padding_side,
             unpack_inference_batch=False,
+            autocast=autocast,
         )

         self.pred_column = pred_column
@@ -118,16 +108,20 @@ def __init__( # noqa: PLR0913
         else:
             self.prob_column = "probs"
             self.keep_prob_column = False
-        self.autocast = autocast

     def outputs(self) -> tuple[list[str], list[str]]:
         return ["data"], [self.pred_column] + ([self.prob_column] if self.keep_prob_column else [])

     def _setup(self, local_files_only: bool = True) -> None:
-        self.model = Deberta.from_pretrained(self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only).cuda().eval()
-        self.model.set_autocast(self.autocast)
+        self.model = (
+            Deberta.from_pretrained(self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only)
+            .cuda()
+            .eval()
+        )

-        config = AutoConfig.from_pretrained(self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only)
+        config = AutoConfig.from_pretrained(
+            self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only
+        )
         self.labels = list(config.label2id.keys())
         self.labels.sort(key=lambda x: config.label2id[x])
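
Same refactor for the generic Deberta classifier: `_forward` becomes `forward`, `set_autocast` disappears, and `_setup` builds the model with a parenthesized `.cuda().eval()` chain. The label sorting at the end is the part worth a note: the label list must line up with the class-id axis of the softmax output. A hedged toy illustration (invented `label2id`; the real mapping comes from the HF `AutoConfig`):

```python
# Invented mapping for illustration; config.label2id supplies the real one.
label2id = {"toxic": 1, "clean": 0}

labels = list(label2id.keys())
labels.sort(key=lambda x: label2id[x])
print(labels)  # ['clean', 'toxic'] -- index i lines up with probs[:, i]

probs = [0.8, 0.2]  # one row of torch.softmax(outputs[:, 0, :], dim=1)
print(labels[probs.index(max(probs))])  # 'clean'
```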

nemo_curator/stages/text/classifiers/fineweb_edu.py

Lines changed: 6 additions & 10 deletions

@@ -84,19 +84,13 @@ def outputs(self) -> tuple[list[str], list[str]]:
         return ["data"], [self.pred_column, self.float_score_column, self.int_score_column]

     @staticmethod
-    def configure_forward(model: torch.nn.Module, autocast: bool = True) -> torch.nn.Module:
+    def configure_forward(model: torch.nn.Module) -> torch.nn.Module:
         original_forward = model.forward

         @torch.no_grad()
         def custom_forward(*args, **kwargs) -> torch.Tensor:
-            if autocast:
-                with torch.autocast(device_type="cuda"):
-                    output = original_forward(*args, **kwargs)
-            else:
-                output = original_forward(*args, **kwargs)
-
+            output = original_forward(*args, **kwargs)
             del args, kwargs
-
             return output.logits.squeeze(-1).float()

         model.forward = custom_forward
@@ -108,9 +102,11 @@ def _setup(self, local_files_only: bool = True) -> None:
             cache_dir=self.cache_dir,
             local_files_only=local_files_only,
         ).cuda()
-        self.model = self.configure_forward(model, self.autocast)
+        self.model = self.configure_forward(model)

-    def process_model_output(self, outputs: torch.Tensor, _: dict[str, torch.Tensor] | None = None) -> dict[str, np.ndarray]:
+    def process_model_output(
+        self, outputs: torch.Tensor, _: dict[str, torch.Tensor] | None = None
+    ) -> dict[str, np.ndarray]:
         logits = outputs.cpu().numpy()

         float_scores = logits.tolist()
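
`configure_forward` keeps its monkey-patching approach but no longer takes an `autocast` argument. A self-contained sketch of the pattern (toy `torch.nn.Linear` instead of the FineWeb-Edu regressor, squeezing the raw output rather than `.logits`):

```python
import torch

def configure_forward(model: torch.nn.Module) -> torch.nn.Module:
    original_forward = model.forward

    @torch.no_grad()
    def custom_forward(*args, **kwargs) -> torch.Tensor:
        # Post-process the raw output; autocast is now the caller's concern.
        output = original_forward(*args, **kwargs)
        return output.squeeze(-1).float()

    model.forward = custom_forward
    return model

model = configure_forward(torch.nn.Linear(4, 1))
print(model(torch.randn(3, 4)).shape)  # torch.Size([3])
```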

nemo_curator/stages/text/classifiers/prompt_task_complexity.py

Lines changed: 10 additions & 14 deletions

@@ -210,14 +210,7 @@ def forward(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         input_ids = batch[INPUT_ID_COLUMN]
         attention_mask = batch[ATTENTION_MASK_COLUMN]

-        if self.autocast:
-            with torch.autocast(device_type="cuda"):
-                return self._forward(input_ids, attention_mask)
-        else:
-            return self._forward(input_ids, attention_mask)
-
-    def set_autocast(self, autocast: bool) -> None:
-        self.autocast = autocast
+        return self._forward(input_ids, attention_mask)


 class PromptTaskComplexityModelStage(ModelStage):
@@ -256,12 +249,15 @@ def outputs(self) -> tuple[list[str], list[str]]:
         return ["data"], OUTPUT_COLUMNS

     def _setup(self, local_files_only: bool = True) -> None:
-        self.model = CustomDeberta.from_pretrained(
-            self.model_identifier,
-            cache_dir=self.cache_dir,
-            local_files_only=local_files_only,
-        ).cuda().eval()
-        self.model.set_autocast(self.autocast)
+        self.model = (
+            CustomDeberta.from_pretrained(
+                self.model_identifier,
+                cache_dir=self.cache_dir,
+                local_files_only=local_files_only,
+            )
+            .cuda()
+            .eval()
+        )

     def process_model_output(self, outputs: torch.Tensor, _: dict[str, torch.Tensor] | None = None) -> torch.Tensor:
         return outputs
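
Beyond dropping the autocast branch, `_setup` is reformatted into a parenthesized chain. The `.eval()` call in that chain matters for determinism: it switches off dropout at inference time. A quick demonstration with a bare dropout layer (toy example, not from the repo):

```python
import torch

m = torch.nn.Dropout(p=0.5)
x = torch.ones(4)

m.train()
print(m(x))  # some entries zeroed, survivors scaled to 2.0

m.eval()
print(m(x))  # tensor([1., 1., 1., 1.]) -- identity in eval mode
```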

nemo_curator/stages/text/embedders/base.py

Lines changed: 13 additions & 23 deletions

@@ -15,8 +15,6 @@
 from dataclasses import dataclass
 from typing import Literal

-import cudf
-import cupy as cp
 import pandas as pd
 import torch
 import torch.nn.functional as F  # noqa: N812
@@ -29,8 +27,6 @@
 from nemo_curator.stages.text.models.utils import ATTENTION_MASK_COLUMN
 from nemo_curator.tasks import DocumentBatch

-from .utils import create_list_series_from_1d_or_2d_ar
-

 class EmbeddingModelStage(ModelStage):
     """HuggingFace model stage that produces embeddings with pooling."""
@@ -41,9 +37,10 @@ def __init__( # noqa: PLR0913
         embedding_field: str = "embeddings",
         pooling: Literal["mean_pooling", "last_token"] = "mean_pooling",
         hf_token: str | None = None,
-        model_inference_batch_size: int = 256,
+        model_inference_batch_size: int = 1024,
         has_seq_order: bool = True,
         padding_side: Literal["left", "right"] = "right",
+        autocast: bool = True,
     ):
         super().__init__(
             model_identifier=model_identifier,
@@ -52,6 +49,7 @@ def __init__( # noqa: PLR0913
             has_seq_order=has_seq_order,
             padding_side=padding_side,
             unpack_inference_batch=True,
+            autocast=autocast,
         )
         self.embedding_field = embedding_field
         self.pooling = pooling
@@ -62,33 +60,23 @@ def outputs(self) -> tuple[list[str], list[str]]:
     def setup(self, _: WorkerMetadata | None = None) -> None:
         """Load the model for inference."""
         self.model = AutoModel.from_pretrained(self.model_identifier, local_files_only=True)
-        self.model.eval()
-        self.model.to("cuda")
+        self.model.eval().to("cuda")

     def process_model_output(
         self, outputs: torch.Tensor, model_input_batch: dict[str, torch.Tensor] | None = None
     ) -> torch.Tensor:
         """Process model outputs to create embeddings."""
         if self.pooling == "mean_pooling":
-            return self._mean_pooling(outputs, model_input_batch[ATTENTION_MASK_COLUMN])
+            return self._mean_pooling(outputs, model_input_batch[ATTENTION_MASK_COLUMN]).cpu()
         else:
-            return self._get_last_token(outputs, model_input_batch[ATTENTION_MASK_COLUMN])
+            return self._get_last_token(outputs, model_input_batch[ATTENTION_MASK_COLUMN]).cpu()

-    def collect_outputs(self, processed_outputs: list[torch.Tensor]) -> cp.ndarray:
-        """Collect embeddings into a cupy array."""
-        # TODO : benchmarking this and maybe stay in cpu land
-        cupy_array_embeddings = [cp.asarray(emb_chunk) for emb_chunk in processed_outputs]
-        return cp.concatenate(cupy_array_embeddings, axis=0)
+    def collect_outputs(self, processed_outputs: list[torch.Tensor]) -> list[list[float]]:
+        return torch.cat(processed_outputs, dim=0).numpy().tolist()

-    def create_output_dataframe(self, df_cpu: pd.DataFrame, collected_output: cp.ndarray) -> pd.DataFrame:
+    def create_output_dataframe(self, df_cpu: pd.DataFrame, collected_output: list[list[float]]) -> pd.DataFrame:
         """Create output dataframe with embeddings."""
-        # TODO: Consider if it even makes sense to goto cudf or just concat in numpy
-        df_gpu = cudf.DataFrame(index=df_cpu.index)
-        df_gpu[self.embedding_field] = create_list_series_from_1d_or_2d_ar(collected_output, index=df_gpu.index)
-        # Add embedding_field back to cpu dataframe
-        df_cpu[self.embedding_field] = df_gpu[self.embedding_field].to_pandas()
-        del df_gpu
-        return df_cpu
+        return df_cpu.assign(**{self.embedding_field: collected_output})

     def _mean_pooling(self, model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
         token_embeddings = model_output[0]
@@ -119,7 +107,8 @@ class EmbeddingCreatorStage(CompositeStage[DocumentBatch, DocumentBatch]):
     max_seq_length: int | None = None
     padding_side: Literal["left", "right"] = "right"
     embedding_pooling: Literal["mean_pooling", "last_token"] = "mean_pooling"
-    model_inference_batch_size: int = 256
+    model_inference_batch_size: int = 1024
+    autocast: bool = True
     sort_by_length: bool = True
     hf_token: str | None = None

@@ -144,6 +133,7 @@ def __post_init__(self) -> None:
                 model_inference_batch_size=self.model_inference_batch_size,
                 has_seq_order=self.sort_by_length,
                 padding_side=self.padding_side,
+                autocast=self.autocast,
             ),
         ]
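
This file carries the substantive change behind the commit title: pooled embeddings move to CPU immediately, are collected with `torch.cat(...).numpy().tolist()`, and are attached via a plain `df_cpu.assign(...)`, removing the `cudf`/`cupy` round trip entirely (and the default inference batch size grows from 256 to 1024). A hedged sketch of the mean-pooling step itself, with toy tensors; the real stage pulls hidden states and the attention mask from the tokenized batch, and the final `F.normalize` is an assumption consistent with the `torch.nn.functional` import:

```python
import torch
import torch.nn.functional as F

token_embeddings = torch.randn(2, 5, 8)           # (batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 0 marks padding

mask = attention_mask.unsqueeze(-1).float()
summed = (token_embeddings * mask).sum(dim=1)     # padding contributes nothing
counts = mask.sum(dim=1).clamp(min=1e-9)          # valid-token count per row
embeddings = F.normalize(summed / counts, p=2, dim=1)
print(embeddings.shape)                           # torch.Size([2, 8])
```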

nemo_curator/stages/text/models/model.py

Lines changed: 14 additions & 3 deletions

@@ -46,6 +46,8 @@ class ModelStage(ProcessingStage[DocumentBatch, DocumentBatch]):
             Sorting is encouraged to improve the performance of the inference model. Defaults to True.
         padding_side: The side to pad the input tokens. Defaults to "right".
         unpack_inference_batch: Whether to unpack the inference batch with **kwargs. Defaults to False.
+        autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference.
+            Defaults to True.

     """

@@ -58,6 +60,7 @@ def __init__( # noqa: PLR0913
         has_seq_order: bool = True,
         padding_side: Literal["left", "right"] = "right",
         unpack_inference_batch: bool = False,
+        autocast: bool = True,
     ):
         self._name = format_name_with_suffix(model_identifier, suffix="_model")
         # Assume that the model can fit on a single GPU
@@ -70,6 +73,7 @@
         self.has_seq_order = has_seq_order
         self.padding_side = padding_side
         self.unpack_inference_batch = unpack_inference_batch
+        self.autocast = autocast

     def inputs(self) -> tuple[list[str], list[str]]:
         return ["data"], [INPUT_ID_COLUMN, ATTENTION_MASK_COLUMN] + ([SEQ_ORDER_COLUMN] if self.has_seq_order else [])
@@ -147,17 +151,24 @@ def create_output_dataframe(self, df_cpu: pd.DataFrame, collected_output: dict[s
         msg = "Subclasses must implement this method"
         raise NotImplementedError(msg)

+    def _model_forward(self, model_input_batch: dict[str, torch.Tensor]) -> torch.Tensor:
+        if self.unpack_inference_batch:
+            return self.model(**model_input_batch)
+        else:
+            return self.model(model_input_batch)
+
     def process(self, batch: DocumentBatch) -> DocumentBatch:
         df_cpu = batch.to_pandas()

         processed_outputs = []
         for model_input_batch in self.yield_next_batch(df_cpu):
             # Forward pass
             with torch.no_grad():
-                if self.unpack_inference_batch:
-                    outputs = self.model(**model_input_batch)
+                if self.autocast:
+                    with torch.autocast(device_type="cuda"):
+                        outputs = self._model_forward(model_input_batch)
                 else:
-                    outputs = self.model(model_input_batch)
+                    outputs = self._model_forward(model_input_batch)

             processed_output = self.process_model_output(outputs, model_input_batch)
             del model_input_batch
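
This is the heart of the commit: `ModelStage.process` now owns the autocast policy, so every subclass gets it through one code path instead of re-implementing the same branch. A standalone sketch of the resulting loop, with assumed names and no NeMo Curator imports:

```python
import torch

def run_inference(
    model: torch.nn.Module, batches: list[torch.Tensor], autocast: bool = True
) -> list[torch.Tensor]:
    outputs = []
    for batch in batches:
        with torch.no_grad():
            if autocast:
                with torch.autocast(device_type="cuda"):
                    out = model(batch)
            else:
                out = model(batch)
        outputs.append(out.float().cpu())  # cast back outside the autocast region
    return outputs
```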

tests/stages/text/embedders/test_base.py

Lines changed: 4 additions & 2 deletions

@@ -292,15 +292,17 @@ def test_embedding_creator_stage_process_integration(self) -> None:
         assert embedding_stage.pooling == stage.embedding_pooling

     @pytest.mark.parametrize("pooling_strategy", ["mean_pooling", "last_token"])
+    @pytest.mark.parametrize("autocast", [True, False])
     @pytest.mark.gpu
     def test_embedding_creator_stage_with_reference_embeddings(
-        self, pooling_strategy: str, sample_data: DocumentBatch
+        self, pooling_strategy: str, sample_data: DocumentBatch, autocast: bool
     ) -> None:
         """Test embeddings match reference implementation (requires GPU and model download)."""
         stage = EmbeddingCreatorStage(
             model_identifier="sentence-transformers/all-MiniLM-L6-v2",
             embedding_pooling=pooling_strategy,
             model_inference_batch_size=32,
+            autocast=autocast,
         )

         # Decompose and setup stages
@@ -362,7 +364,7 @@ def _get_reference_embeddings(
         )
         inputs = {k: v.to("cuda") for k, v in inputs.items()}

-        with torch.no_grad(), torch.autocast(device_type="cuda"):
+        with torch.no_grad():
             outputs = model(**inputs)

         if pooling_strategy == "last_token":
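
The reference implementation drops its own autocast so the test can compare the stage's output (fp16 under autocast, fp32 without) against a clean fp32 baseline. When writing such comparisons, reduced precision calls for looser tolerances; a deterministic toy example of the magnitude of fp16 round-trip error (tolerance values illustrative, not taken from the test file):

```python
import torch

torch.manual_seed(0)
ref = torch.randn(4, 384)     # stand-in for fp32 reference embeddings
approx = ref.half().float()   # simulate precision lost to float16

# float16 keeps ~11 bits of mantissa, so relative error stays under ~5e-4.
torch.testing.assert_close(approx, ref, rtol=1e-3, atol=1e-3)
```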
