
Commit b260f08

Add more Python type annotations to cudf/core (rapidsai#20287)
Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
URL: rapidsai#20287
1 parent 56eb2e7 commit b260f08

11 files changed, +111 -54 lines changed

docs/cudf/source/conf.py
Lines changed: 1 addition & 0 deletions

@@ -605,6 +605,7 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "StringColumn"),
     ("py:class", "ColumnLike"),
     ("py:class", "DtypeObj"),
+    ("py:class", "Axis"),
     ("py:class", "ArrowLike"),
 ]

python/cudf/cudf/_typing.py
Lines changed: 5 additions & 1 deletion

@@ -2,7 +2,7 @@
 
 import sys
 from collections.abc import Callable, Iterable
-from typing import TYPE_CHECKING, Any, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union
 
 import numpy as np
 from pandas import Period, Timedelta, Timestamp
@@ -22,7 +22,9 @@
 # Many of these are from
 # https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py
 
+# Dtype should ideally only used for public facing APIs
 Dtype = Union["ExtensionDtype", str, np.dtype]
+# DtypeObj should be used otherwise
 DtypeObj = Union["ExtensionDtype", np.dtype]
 
 # scalars
@@ -46,3 +48,5 @@
 MultiColumnAggType = Union[  # noqa: UP007
     AggType, Iterable[AggType], dict[Any, Iterable[AggType]]
 ]
+
+Axis = Literal[0, 1, "index", "columns"]
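The new Axis alias is the piece that most of the later files pick up. As a rough, hedged sketch of how such a Literal alias is typically consumed by annotated APIs (the normalize_axis helper below is hypothetical, not part of this commit):

# Minimal sketch, assuming only the Axis alias added above; normalize_axis
# is an illustrative helper, not cudf code.
from typing import Literal

Axis = Literal[0, 1, "index", "columns"]


def normalize_axis(axis: Axis) -> int:
    """Map an Axis value onto the integer form used internally."""
    if axis in (0, "index"):
        return 0
    if axis in (1, "columns"):
        return 1
    raise ValueError(f"{axis=} should be 0, 1, 'index' or 'columns'")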

python/cudf/cudf/core/column/column.py
Lines changed: 10 additions & 7 deletions

@@ -3365,7 +3365,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]:
     return columns
 
 
-def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
+def concat_columns(objs: Sequence[ColumnBase]) -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:
         return column_empty(0, dtype=np.dtype(np.float64))
@@ -3386,30 +3386,33 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     # Find the first non-null column:
     head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0])
 
-    for i, obj in enumerate(objs):
+    new_objs = list(objs)
+    for i, obj in enumerate(new_objs):
         # Check that all columns are the same type:
         if not is_dtype_equal(obj.dtype, head.dtype):
             # if all null, cast to appropriate dtype
             if obj.null_count == len(obj):
-                objs[i] = column_empty(row_count=len(obj), dtype=head.dtype)
+                new_objs[i] = column_empty(
+                    row_count=len(obj), dtype=head.dtype
+                )
             else:
                 raise ValueError("All columns must be the same type")
 
     # TODO: This logic should be generalized to a dispatch to
     # ColumnBase._concat so that all subclasses can override necessary
     # behavior. However, at the moment it's not clear what that API should look
     # like, so CategoricalColumn simply implements a minimal working API.
-    if all(isinstance(o.dtype, CategoricalDtype) for o in objs):
+    if all(isinstance(o.dtype, CategoricalDtype) for o in new_objs):
         return cudf.core.column.categorical.CategoricalColumn._concat(
             cast(
                 MutableSequence[
                     cudf.core.column.categorical.CategoricalColumn
                 ],
-                objs,
+                new_objs,
             )
         )
 
-    newsize = sum(map(len, objs))
+    newsize = sum(map(len, new_objs))
     if newsize > np.iinfo(SIZE_TYPE_DTYPE).max:
         raise MemoryError(
             f"Result of concat cannot have size > {SIZE_TYPE_DTYPE}_MAX"
@@ -3418,7 +3421,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         return column_empty(0, head.dtype)
 
     # Filter out inputs that have 0 length, then concatenate.
-    objs_with_len = [o for o in objs if len(o)]
+    objs_with_len = [o for o in new_objs if len(o)]
     with acquire_spill_lock():
         return ColumnBase.from_pylibcudf(
             plc.concatenate.concatenate(
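The signature change from MutableSequence to Sequence is what forces the new_objs copy: a Sequence parameter advertises read-only access, so the replacement of all-null columns has to happen on a local list rather than the caller's container. A standalone sketch of that pattern, illustrative only and not cudf code:

# Toy example of the pattern the hunk above adopts: copy the input Sequence
# to a list before any per-element replacement, leaving the caller's
# container untouched.
from collections.abc import Sequence


def pad_short_strings(objs: Sequence[str], width: int) -> list[str]:
    new_objs = list(objs)  # local copy; safe even if objs is a tuple
    for i, obj in enumerate(new_objs):
        if len(obj) < width:
            new_objs[i] = obj.ljust(width)
    return new_objs


assert pad_short_strings(("a", "bbb"), 3) == ["a  ", "bbb"]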

python/cudf/cudf/core/dataframe.py
Lines changed: 34 additions & 14 deletions

@@ -124,7 +124,13 @@
 )
 
 if TYPE_CHECKING:
-    from cudf._typing import ColumnLike, Dtype, NotImplementedType
+    from cudf._typing import (
+        Axis,
+        ColumnLike,
+        Dtype,
+        NotImplementedType,
+        ScalarLike,
+    )
 
 _cupy_nan_methods_map = {
     "min": "nanmin",
@@ -6517,11 +6523,11 @@ def count(self, axis=0, numeric_only=False):
     @_performance_tracking
     def _reduce(
         self,
-        op,
+        op: str,
         axis=None,
-        numeric_only=False,
+        numeric_only: bool = False,
         **kwargs,
-    ):
+    ) -> ScalarLike:
         source = self
 
         if axis is None:
@@ -6597,13 +6603,15 @@
            )(**kwargs)
         else:
             source_dtypes = [dtype for _, dtype in source._dtypes]
+            # TODO: What happens if common_dtype is None?
             common_dtype = find_common_type(source_dtypes)
             if (
                 common_dtype == CUDF_STRING_DTYPE
                 and any(
                     dtype != CUDF_STRING_DTYPE for dtype in source_dtypes
                 )
-                or common_dtype.kind != "b"
+                or common_dtype is not None
+                and common_dtype.kind != "b"
                 and any(dtype.kind == "b" for dtype in source_dtypes)
             ):
                 raise TypeError(
@@ -6622,15 +6630,21 @@
             if res.isnull().all():
                 if cudf.api.types.is_numeric_dtype(common_dtype):
                     if op in {"sum", "product"}:
-                        if common_dtype.kind == "f":
+                        if (
+                            common_dtype is not None
+                            and common_dtype.kind == "f"
+                        ):
                             res_dtype = (
                                 np.dtype("float64")
                                 if isinstance(
                                     common_dtype, pd.ArrowDtype
                                 )
                                 else common_dtype
                             )
-                        elif common_dtype.kind == "u":
+                        elif (
+                            common_dtype is not None
+                            and common_dtype.kind == "u"
+                        ):
                             res_dtype = np.dtype("uint64")
                         else:
                             res_dtype = np.dtype("int64")
@@ -6645,7 +6659,10 @@
                         "skew",
                         "median",
                     }:
-                        if common_dtype.kind == "f":
+                        if (
+                            common_dtype is not None
+                            and common_dtype.kind == "f"
+                        ):
                             res_dtype = (
                                 np.dtype("float64")
                                 if isinstance(
@@ -6668,19 +6685,22 @@
     @_performance_tracking
     def _scan(
         self,
-        op,
-        axis=None,
+        op: str,
+        axis: Axis | None = None,
+        skipna: bool = True,
         *args,
         **kwargs,
-    ):
+    ) -> Self:
         if axis is None:
             axis = 0
         axis = self._get_axis_from_axis_arg(axis)
 
         if axis == 0:
-            return super()._scan(op, axis=axis, *args, **kwargs)
+            return super()._scan(op, axis=axis, skipna=skipna, *args, **kwargs)
         elif axis == 1:
-            return self._apply_cupy_method_axis_1(op, **kwargs)
+            return self._apply_cupy_method_axis_1(op, skipna=skipna, **kwargs)
+        else:
+            raise ValueError(f"{axis=} should be None, 0 or 1")
 
     @_performance_tracking
     def mode(self, axis=0, numeric_only=False, dropna=True):
@@ -6808,7 +6828,7 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
         return super(DataFrame, obj).any(axis, skipna, **kwargs)
 
     @_performance_tracking
-    def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
+    def _apply_cupy_method_axis_1(self, method: str, *args, **kwargs):
         # This method uses cupy to perform scans and reductions along rows of a
         # DataFrame. Since cuDF is designed around columnar storage and
         # operations, we convert DataFrames to 2D cupy arrays for these ops.
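The _scan hunk makes three things explicit: skipna is now a named parameter threaded through both branches, axis is normalized before dispatch, and any value other than 0 or 1 raises. A rough numpy stand-in for that dispatch shape (assumed helper, not the cudf implementation):

# Sketch only: numpy stands in for the column/cupy paths used by cudf.
import numpy as np


def cumsum_2d(values: np.ndarray, axis=None, skipna: bool = True) -> np.ndarray:
    if axis is None:
        axis = 0
    # Normalize the pandas-style axis spellings onto integers.
    axis = {0: 0, "index": 0, 1: 1, "columns": 1}.get(axis, axis)
    if skipna:
        # Treat NaN as the identity element, mimicking skipna=True.
        values = np.nan_to_num(values, nan=0.0)
    if axis == 0:
        return np.cumsum(values, axis=0)  # per-column scan
    elif axis == 1:
        return np.cumsum(values, axis=1)  # per-row scan
    else:
        raise ValueError(f"{axis=} should be None, 0 or 1")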

python/cudf/cudf/core/frame.py
Lines changed: 23 additions & 4 deletions

@@ -51,7 +51,7 @@
     )
     from types import ModuleType
 
-    from cudf._typing import Dtype, DtypeObj, ScalarLike
+    from cudf._typing import Axis, Dtype, DtypeObj, ScalarLike
     from cudf.core.series import Series
 
 
@@ -92,7 +92,7 @@ def _column_labels_and_values(
         return zip(self._column_names, self._columns, strict=True)
 
     @property
-    def _dtypes(self) -> Generator[tuple[Hashable, Dtype], None, None]:
+    def _dtypes(self) -> Generator[tuple[Hashable, DtypeObj], None, None]:
        for label, col in self._column_labels_and_values:
            yield label, col.dtype
 
@@ -1772,9 +1772,28 @@ def __bool__(self) -> None:
         )
 
     @_performance_tracking
-    def _reduce(self, *args, **kwargs):
+    def _reduce(
+        self,
+        op: str,
+        axis=no_default,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> ScalarLike:
+        raise NotImplementedError(
+            f"Reductions are not supported for objects of type {type(self).__name__}."
+        )
+
+    @_performance_tracking
+    def _scan(
+        self,
+        op: str,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        *args,
+        **kwargs,
+    ) -> Self:
         raise NotImplementedError(
-            f"Reductions are not supported for objects of type {type(self)}."
+            f"Scans are not supported for objects of type {type(self).__name__}."
         )
 
     @_performance_tracking
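These base-class stubs give Frame typed _reduce and _scan signatures that simply raise, so subclasses such as DataFrame and the indexed frames override them with matching signatures. A toy illustration of the pattern (stand-in classes, not cudf's hierarchy):

# Illustrative only: the base defines the typed signature and raises,
# a concrete subclass overrides it with a real implementation.
from typing import Any


class Base:
    def _reduce(self, op: str, axis: Any = None,
                numeric_only: bool = False, **kwargs) -> Any:
        raise NotImplementedError(
            f"Reductions are not supported for objects of type {type(self).__name__}."
        )


class Numbers(Base):
    def __init__(self, values: list[float]) -> None:
        self.values = values

    def _reduce(self, op: str, axis: Any = None,
                numeric_only: bool = False, **kwargs) -> float:
        return {"sum": sum, "max": max, "min": min}[op](self.values)


assert Numbers([1.0, 2.0, 3.0])._reduce("sum") == 6.0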

python/cudf/cudf/core/index.py
Lines changed: 0 additions & 8 deletions

@@ -2185,14 +2185,6 @@ def __getitem__(self, index):
             res = Index._from_column(res, name=self.name)
         return res
 
-    @property  # type: ignore
-    @_performance_tracking
-    def dtype(self):
-        """
-        `dtype` of the underlying values in Index.
-        """
-        return self._column.dtype
-
     @_performance_tracking
     def isna(self) -> cupy.ndarray:
         return self._column.isnull().values

python/cudf/cudf/core/indexed_frame.py
Lines changed: 13 additions & 1 deletion

@@ -92,6 +92,7 @@
     )
 
     from cudf._typing import (
+        Axis,
         ColumnLike,
         DataFrameOrSeries,
         Dtype,
@@ -448,7 +449,14 @@ def _mimic_inplace(
         return super()._mimic_inplace(result, inplace)
 
     @_performance_tracking
-    def _scan(self, op, axis=None, skipna=True):
+    def _scan(
+        self,
+        op: str,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        *args,
+        **kwargs,
+    ) -> Self:
         """
         Return {op_name} of the {cls}.
 
@@ -488,6 +496,10 @@ def _scan(self, op, axis=None, skipna=True):
         2   6  24
         3  10  34
         """
+        if "numeric_only" in kwargs:
+            raise TypeError(
+                "got an unexpected keyword argument 'numeric_only'"
+            )
         cast_to_int = op in ("cumsum", "cumprod")
         skipna = True if skipna is None else skipna

python/cudf/cudf/core/series.py
Lines changed: 0 additions & 6 deletions

@@ -1475,12 +1475,6 @@ def list(self):
     def struct(self):
         return StructMethods(parent=self)
 
-    @property  # type: ignore
-    @_performance_tracking
-    def dtype(self):
-        """The dtype of the Series."""
-        return self._column.dtype
-
     @property  # type: ignore
     @_performance_tracking
     def dtypes(self):
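The dtype properties deleted from Index and Series were identical pass-throughs to self._column.dtype; presumably the behaviour is now left to a shared definition elsewhere in the hierarchy (an assumption, since the commit only shows the deletions). A hedged illustration of why such duplicates become redundant once a common base exposes the property:

# Stand-in classes, not cudf's actual hierarchy: once the base owns the
# property, per-subclass copies add nothing.
import numpy as np


class ColumnOwner:
    def __init__(self, column: np.ndarray) -> None:
        self._column = column

    @property
    def dtype(self) -> np.dtype:
        """The dtype of the underlying values."""
        return self._column.dtype


class SeriesLike(ColumnOwner):  # no local dtype property needed
    pass


class IndexLike(ColumnOwner):   # inherits dtype as well
    pass


assert SeriesLike(np.array([1, 2, 3], dtype="int64")).dtype == np.dtype("int64")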
