add option to choose axis in labeler (#48)

edknv · web-flow · commit 06afeeee9741 · 2024-01-23T15:08:34.000-08:00
* add `axis` option to Labeler to choose the argmax axis; remove the `keep_prob` argument

* use cupy argmax

* fix lint errors

* fix test case to use a cudf.Series as input
diff --git a/crossfit/op/label.py b/crossfit/op/label.py
@@ -1,6 +1,7 @@
 from typing import List, Union
 
 import cudf
+import cupy as cp
 
 from crossfit.op.base import Op
 
@@ -12,29 +13,27 @@ def __init__(
         cols=None,
         keep_cols=None,
         pre=None,
-        keep_prob: bool = False,
         suffix: str = "labels",
+        axis=-1,
     ):
         super().__init__(pre=pre, cols=cols, keep_cols=keep_cols)
         self.labels = labels
-        self.keep_prob = keep_prob
         self.suffix = suffix
+        self.axis = axis
 
     def call_column(self, data: cudf.Series) -> cudf.Series:
         if isinstance(data, cudf.DataFrame):
             raise ValueError(
                 "data must be a Series, got DataFrame. Add a pre step to convert to Series"
             )
 
-        num_labels = len(data.iloc[0])
-        if len(self.labels) != num_labels:
-            raise ValueError(
-                f"The number of provided labels is {len(self.labels)} "
-                f"but there are {num_labels} in data."
-            )
+        shape = (data.size,) + cp.asarray(data.iloc[0]).shape
+        scores = data.list.leaves.values.reshape(shape)
+        classes = scores.argmax(self.axis)
+
+        if len(classes.shape) > 1:
+            raise RuntimeError(f"Max category of the axis {self.axis} of data is not a 1-d array.")
 
-        scores = data.list.leaves.values.reshape(-1, num_labels)
-        classes = scores.argmax(-1)
         labels_map = {i: self.labels[i] for i in range(len(self.labels))}
 
         return cudf.Series(classes).map(labels_map)
@@ -60,7 +59,7 @@ def call(self, data: Union[cudf.Series, cudf.DataFrame]) -> Union[cudf.Series, c
     def meta(self):
         labeled = {"labels": "string"}
 
-        if len(self.cols) > 1:
+        if self.cols and len(self.cols) > 1:
             labeled = {
                 self._construct_name(col, suffix): dtype
                 for col in self.cols
diff --git a/tests/op/test_label.py b/tests/op/test_label.py
@@ -0,0 +1,35 @@
+import pytest
+
+cudf = pytest.importorskip("cudf")
+
+import crossfit as cf  # noqa: E402
+
+
+def test_labeler_basic():
+    df = cudf.Series(
+        [
+            [0.1, 0.2, 0.5],
+            [0.2, 0.1, 0.3],
+            [0.3, 0.2, 0.1],
+            [0.2, 0.3, 0.1],
+        ]
+    )
+    labeler = cf.op.Labeler(list("abc"))
+    results = labeler(df)
+
+    assert results.to_pandas().values.tolist() == ["c", "c", "a", "b"]
+
+
+def test_labeler_first_axis():
+    df = cudf.Series(
+        [
+            [0.1, 0.2, 0.5],
+            [0.2, 0.1, 0.3],
+            [0.3, 0.2, 0.1],
+            [0.2, 0.3, 0.1],
+        ]
+    )
+    labeler = cf.op.Labeler(list("abcd"), axis=0)
+    results = labeler(df)
+
+    assert results.to_pandas().values.tolist() == ["c", "d", "a"]