Minor CrossFit improvements (#483)

sarahyurick · web-flow · commit 3483ee4e365e · 2025-02-18T10:30:55.000-08:00
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
diff --git a/nemo_curator/classifiers/base.py b/nemo_curator/classifiers/base.py
@@ -123,10 +123,13 @@ def _run_classifier_helper(
     prob_col: str = None,
 ) -> "dask_cudf.DataFrame":
 
-    if prob_col:
-        df[prob_col] = 0
-    else:
+    if prob_col is None:
         prob_col = "_prob"
+        labeler = op.Labeler(labels, cols=[prob_col], suffix=label_col)
+    else:
+        labeler = op.Labeler(
+            labels, cols=[prob_col], keep_cols=[prob_col], suffix=label_col
+        )
 
     columns_to_keep_list = df.columns.to_list()
 
@@ -140,7 +143,7 @@ def _run_classifier_helper(
             batch_size=batch_size,
             pred_output_col=prob_col,
         ),
-        op.Labeler(labels, cols=[prob_col], suffix=label_col),
+        labeler,
         repartition=df.npartitions,
         keep_cols=columns_to_keep_list,
     )
diff --git a/nemo_curator/classifiers/prompt_task_complexity.py b/nemo_curator/classifiers/prompt_task_complexity.py
@@ -337,11 +337,15 @@ def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset:
 
         df = dataset.df
         columns_to_keep_list = df.columns.to_list()
-        df["sliced_text"] = df[self.text_field].str.slice(0, self.max_chars)
 
         model = self.model
         classifier_pipe = op.Sequential(
-            op.Tokenizer(model, cols=["sliced_text"], tokenizer_type="default"),
+            op.Tokenizer(
+                model,
+                cols=[self.text_field],
+                tokenizer_type="default",
+                max_chars=self.max_chars,
+            ),
             op.Predictor(
                 model,
                 sorted_data_loader=True,