
Commit b260f08

Add more Python type annotations to cudf/core (rapidsai#20287)
Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
URL: rapidsai#20287
1 parent 56eb2e7 commit b260f08

11 files changed, +111 -54 lines changed

docs/cudf/source/conf.py
Lines changed: 1 addition & 0 deletions

@@ -605,6 +605,7 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "StringColumn"),
     ("py:class", "ColumnLike"),
     ("py:class", "DtypeObj"),
+    ("py:class", "Axis"),
     ("py:class", "ArrowLike"),
 ]

python/cudf/cudf/_typing.py
Lines changed: 5 additions & 1 deletion

@@ -2,7 +2,7 @@
 
 import sys
 from collections.abc import Callable, Iterable
-from typing import TYPE_CHECKING, Any, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union
 
 import numpy as np
 from pandas import Period, Timedelta, Timestamp
@@ -22,7 +22,9 @@
 # Many of these are from
 # https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py
 
+# Dtype should ideally only used for public facing APIs
 Dtype = Union["ExtensionDtype", str, np.dtype]
+# DtypeObj should be used otherwise
 DtypeObj = Union["ExtensionDtype", np.dtype]
 
 # scalars
@@ -46,3 +48,5 @@
 MultiColumnAggType = Union[  # noqa: UP007
     AggType, Iterable[AggType], dict[Any, Iterable[AggType]]
 ]
+
+Axis = Literal[0, 1, "index", "columns"]
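The new Axis alias is the piece that most of the later files pick up. As a rough, hedged sketch of how such a Literal alias is typically consumed by annotated APIs (the normalize_axis helper below is hypothetical, not part of this commit):

# Minimal sketch, assuming only the Axis alias added above; normalize_axis
# is an illustrative helper, not cudf code.
from typing import Literal

Axis = Literal[0, 1, "index", "columns"]


def normalize_axis(axis: Axis) -> int:
    """Map an Axis value onto the integer form used internally."""
    if axis in (0, "index"):
        return 0
    if axis in (1, "columns"):
        return 1
    raise ValueError(f"{axis=} should be 0, 1, 'index' or 'columns'")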

python/cudf/cudf/core/column/column.py
Lines changed: 10 additions & 7 deletions

@@ -3365,7 +3365,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]:
     return columns
 
 
-def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
+def concat_columns(objs: Sequence[ColumnBase]) -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:
         return column_empty(0, dtype=np.dtype(np.float64))
@@ -3386,30 +3386,33 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     # Find the first non-null column:
     head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0])
 
-    for i, obj in enumerate(objs):
+    new_objs = list(objs)
+    for i, obj in enumerate(new_objs):
         # Check that all columns are the same type:
         if not is_dtype_equal(obj.dtype, head.dtype):
             # if all null, cast to appropriate dtype
             if obj.null_count == len(obj):
-                objs[i] = column_empty(row_count=len(obj), dtype=head.dtype)
+                new_objs[i] = column_empty(
+                    row_count=len(obj), dtype=head.dtype
+                )
             else:
                 raise ValueError("All columns must be the same type")
 
     # TODO: This logic should be generalized to a dispatch to
     # ColumnBase._concat so that all subclasses can override necessary
     # behavior. However, at the moment it's not clear what that API should look
     # like, so CategoricalColumn simply implements a minimal working API.
-    if all(isinstance(o.dtype, CategoricalDtype) for o in objs):
+    if all(isinstance(o.dtype, CategoricalDtype) for o in new_objs):
         return cudf.core.column.categorical.CategoricalColumn._concat(
             cast(
                 MutableSequence[
                     cudf.core.column.categorical.CategoricalColumn
                 ],
-                objs,
+                new_objs,
             )
         )
 
-    newsize = sum(map(len, objs))
+    newsize = sum(map(len, new_objs))
     if newsize > np.iinfo(SIZE_TYPE_DTYPE).max:
         raise MemoryError(
             f"Result of concat cannot have size > {SIZE_TYPE_DTYPE}_MAX"
@@ -3418,7 +3421,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         return column_empty(0, head.dtype)
 
     # Filter out inputs that have 0 length, then concatenate.
-    objs_with_len = [o for o in objs if len(o)]
+    objs_with_len = [o for o in new_objs if len(o)]
     with acquire_spill_lock():
         return ColumnBase.from_pylibcudf(
             plc.concatenate.concatenate(
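The signature change from MutableSequence to Sequence is what forces the new_objs copy: a Sequence parameter advertises read-only access, so the replacement of all-null columns has to happen on a local list rather than the caller's container. A standalone sketch of that pattern, illustrative only and not cudf code:

# Toy example of the pattern the hunk above adopts: copy the input Sequence
# to a list before any per-element replacement, leaving the caller's
# container untouched.
from collections.abc import Sequence


def pad_short_strings(objs: Sequence[str], width: int) -> list[str]:
    new_objs = list(objs)  # local copy; safe even if objs is a tuple
    for i, obj in enumerate(new_objs):
        if len(obj) < width:
            new_objs[i] = obj.ljust(width)
    return new_objs


assert pad_short_strings(("a", "bbb"), 3) == ["a  ", "bbb"]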

python/cudf/cudf/core/dataframe.py
Lines changed: 34 additions & 14 deletions

@@ -124,7 +124,13 @@
 )
 
 if TYPE_CHECKING:
-    from cudf._typing import ColumnLike, Dtype, NotImplementedType
+    from cudf._typing import (
+        Axis,
+        ColumnLike,
+        Dtype,
+        NotImplementedType,
+        ScalarLike,
+    )
 
 _cupy_nan_methods_map = {
     "min": "nanmin",
@@ -6517,11 +6523,11 @@ def count(self, axis=0, numeric_only=False):
     @_performance_tracking
     def _reduce(
         self,
-        op,
+        op: str,
         axis=None,
-        numeric_only=False,
+        numeric_only: bool = False,
         **kwargs,
-    ):
+    ) -> ScalarLike:
         source = self
 
         if axis is None:
@@ -6597,13 +6603,15 @@
            )(**kwargs)
         else:
             source_dtypes = [dtype for _, dtype in source._dtypes]
+            # TODO: What happens if common_dtype is None?
             common_dtype = find_common_type(source_dtypes)
             if (
                 common_dtype == CUDF_STRING_DTYPE
                 and any(
                     dtype != CUDF_STRING_DTYPE for dtype in source_dtypes
                 )
-                or common_dtype.kind != "b"
+                or common_dtype is not None
+                and common_dtype.kind != "b"
                 and any(dtype.kind == "b" for dtype in source_dtypes)
             ):
                 raise TypeError(
@@ -6622,15 +6630,21 @@
             if res.isnull().all():
                 if cudf.api.types.is_numeric_dtype(common_dtype):
                     if op in {"sum", "product"}:
-                        if common_dtype.kind == "f":
+                        if (
+                            common_dtype is not None
+                            and common_dtype.kind == "f"
+                        ):
                             res_dtype = (
                                 np.dtype("float64")
                                 if isinstance(
                                     common_dtype, pd.ArrowDtype
                                 )
                                 else common_dtype
                             )
-                        elif common_dtype.kind == "u":
+                        elif (
+                            common_dtype is not None
+                            and common_dtype.kind == "u"
+                        ):
                             res_dtype = np.dtype("uint64")
                         else:
                             res_dtype = np.dtype("int64")
@@ -6645,7 +6659,10 @@
                         "skew",
                         "median",
                     }:
-                        if common_dtype.kind == "f":
+                        if (
+                            common_dtype is not None
+                            and common_dtype.kind == "f"
+                        ):
                             res_dtype = (
                                 np.dtype("float64")
                                 if isinstance(
@@ -6668,19 +6685,22 @@
     @_performance_tracking
     def _scan(
         self,
-        op,
-        axis=None,
+        op: str,
+        axis: Axis | None = None,
+        skipna: bool = True,
         *args,
         **kwargs,
-    ):
+    ) -> Self:
         if axis is None:
             axis = 0
         axis = self._get_axis_from_axis_arg(axis)
 
         if axis == 0:
-            return super()._scan(op, axis=axis, *args, **kwargs)
+            return super()._scan(op, axis=axis, skipna=skipna, *args, **kwargs)
         elif axis == 1:
-            return self._apply_cupy_method_axis_1(op, **kwargs)
+            return self._apply_cupy_method_axis_1(op, skipna=skipna, **kwargs)
+        else:
+            raise ValueError(f"{axis=} should be None, 0 or 1")
 
     @_performance_tracking
     def mode(self, axis=0, numeric_only=False, dropna=True):
@@ -6808,7 +6828,7 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
         return super(DataFrame, obj).any(axis, skipna, **kwargs)
 
     @_performance_tracking
-    def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
+    def _apply_cupy_method_axis_1(self, method: str, *args, **kwargs):
         # This method uses cupy to perform scans and reductions along rows of a
         # DataFrame. Since cuDF is designed around columnar storage and
         # operations, we convert DataFrames to 2D cupy arrays for these ops.
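The _scan hunk makes three things explicit: skipna is now a named parameter threaded through both branches, axis is normalized before dispatch, and any value other than 0 or 1 raises. A rough numpy stand-in for that dispatch shape (assumed helper, not the cudf implementation):

# Sketch only: numpy stands in for the column/cupy paths used by cudf.
import numpy as np


def cumsum_2d(values: np.ndarray, axis=None, skipna: bool = True) -> np.ndarray:
    if axis is None:
        axis = 0
    # Normalize the pandas-style axis spellings onto integers.
    axis = {0: 0, "index": 0, 1: 1, "columns": 1}.get(axis, axis)
    if skipna:
        # Treat NaN as the identity element, mimicking skipna=True.
        values = np.nan_to_num(values, nan=0.0)
    if axis == 0:
        return np.cumsum(values, axis=0)  # per-column scan
    elif axis == 1:
        return np.cumsum(values, axis=1)  # per-row scan
    else:
        raise ValueError(f"{axis=} should be None, 0 or 1")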

python/cudf/cudf/core/frame.py
Lines changed: 23 additions & 4 deletions

@@ -51,7 +51,7 @@
     )
     from types import ModuleType
 
-    from cudf._typing import Dtype, DtypeObj, ScalarLike
+    from cudf._typing import Axis, Dtype, DtypeObj, ScalarLike
     from cudf.core.series import Series
 
 
@@ -92,7 +92,7 @@ def _column_labels_and_values(
         return zip(self._column_names, self._columns, strict=True)
 
     @property
-    def _dtypes(self) -> Generator[tuple[Hashable, Dtype], None, None]:
+    def _dtypes(self) -> Generator[tuple[Hashable, DtypeObj], None, None]:
        for label, col in self._column_labels_and_values:
            yield label, col.dtype
 
@@ -1772,9 +1772,28 @@ def __bool__(self) -> None:
         )
 
     @_performance_tracking
-    def _reduce(self, *args, **kwargs):
+    def _reduce(
+        self,
+        op: str,
+        axis=no_default,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> ScalarLike:
+        raise NotImplementedError(
+            f"Reductions are not supported for objects of type {type(self).__name__}."
+        )
+
+    @_performance_tracking
+    def _scan(
+        self,
+        op: str,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        *args,
+        **kwargs,
+    ) -> Self:
         raise NotImplementedError(
-            f"Reductions are not supported for objects of type {type(self)}."
+            f"Scans are not supported for objects of type {type(self).__name__}."
         )
 
     @_performance_tracking
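These base-class stubs give Frame typed _reduce and _scan signatures that simply raise, so subclasses such as DataFrame and the indexed frames override them with matching signatures. A toy illustration of the pattern (stand-in classes, not cudf's hierarchy):

# Illustrative only: the base defines the typed signature and raises,
# a concrete subclass overrides it with a real implementation.
from typing import Any


class Base:
    def _reduce(self, op: str, axis: Any = None,
                numeric_only: bool = False, **kwargs) -> Any:
        raise NotImplementedError(
            f"Reductions are not supported for objects of type {type(self).__name__}."
        )


class Numbers(Base):
    def __init__(self, values: list[float]) -> None:
        self.values = values

    def _reduce(self, op: str, axis: Any = None,
                numeric_only: bool = False, **kwargs) -> float:
        return {"sum": sum, "max": max, "min": min}[op](self.values)


assert Numbers([1.0, 2.0, 3.0])._reduce("sum") == 6.0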

python/cudf/cudf/core/index.py
Lines changed: 0 additions & 8 deletions

@@ -2185,14 +2185,6 @@ def __getitem__(self, index):
             res = Index._from_column(res, name=self.name)
         return res
 
-    @property  # type: ignore
-    @_performance_tracking
-    def dtype(self):
-        """
-        `dtype` of the underlying values in Index.
-        """
-        return self._column.dtype
-
     @_performance_tracking
     def isna(self) -> cupy.ndarray:
         return self._column.isnull().values

python/cudf/cudf/core/indexed_frame.py
Lines changed: 13 additions & 1 deletion

@@ -92,6 +92,7 @@
     )
 
     from cudf._typing import (
+        Axis,
         ColumnLike,
         DataFrameOrSeries,
         Dtype,
@@ -448,7 +449,14 @@ def _mimic_inplace(
         return super()._mimic_inplace(result, inplace)
 
     @_performance_tracking
-    def _scan(self, op, axis=None, skipna=True):
+    def _scan(
+        self,
+        op: str,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        *args,
+        **kwargs,
+    ) -> Self:
         """
         Return {op_name} of the {cls}.
 
@@ -488,6 +496,10 @@ def _scan(self, op, axis=None, skipna=True):
         2   6  24
         3  10  34
         """
+        if "numeric_only" in kwargs:
+            raise TypeError(
+                "got an unexpected keyword argument 'numeric_only'"
+            )
         cast_to_int = op in ("cumsum", "cumprod")
         skipna = True if skipna is None else skipna

python/cudf/cudf/core/series.py
Lines changed: 0 additions & 6 deletions

@@ -1475,12 +1475,6 @@ def list(self):
     def struct(self):
         return StructMethods(parent=self)
 
-    @property  # type: ignore
-    @_performance_tracking
-    def dtype(self):
-        """The dtype of the Series."""
-        return self._column.dtype
-
     @property  # type: ignore
     @_performance_tracking
     def dtypes(self):
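The dtype properties deleted from Index and Series were identical pass-throughs to self._column.dtype; presumably the behaviour is now left to a shared definition elsewhere in the hierarchy (an assumption, since the commit only shows the deletions). A hedged illustration of why such duplicates become redundant once a common base exposes the property:

# Stand-in classes, not cudf's actual hierarchy: once the base owns the
# property, per-subclass copies add nothing.
import numpy as np


class ColumnOwner:
    def __init__(self, column: np.ndarray) -> None:
        self._column = column

    @property
    def dtype(self) -> np.dtype:
        """The dtype of the underlying values."""
        return self._column.dtype


class SeriesLike(ColumnOwner):  # no local dtype property needed
    pass


class IndexLike(ColumnOwner):   # inherits dtype as well
    pass


assert SeriesLike(np.array([1, 2, 3], dtype="int64")).dtype == np.dtype("int64")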
