Skip to content

Commit 070f03b

Browse files
committed
Cache some more properties
1 parent a70c936 commit 070f03b

File tree

1 file changed

+17
-10
lines changed

1 file changed

+17
-10
lines changed

python/cudf/cudf/core/column/column.py

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -312,6 +312,7 @@ class ColumnBase(Serializable, BinaryOperand, Reducible):
312312
plc_column: plc.Column
313313
_dtype: DtypeObj
314314
_distinct_count: dict[bool, int]
315+
_has_nulls: dict[bool, bool]
315316
_exposed_buffers: set[Buffer]
316317
_CACHED_PROPERTY_NAMES: ClassVar[frozenset[str]] = frozenset()
317318

@@ -357,16 +358,16 @@ def _PANDAS_NA_VALUE(self) -> ScalarLike:
357358
def dtype(self) -> DtypeObj:
358359
return self._dtype
359360

360-
@property
361+
@cached_property
361362
def size(self) -> int:
362363
return self.plc_column.size()
363364

364-
@property
365+
@cached_property
365366
def data(self) -> None | Buffer:
366367
"""Get data buffer from pylibcudf column."""
367368
return cast("Buffer | None", self.plc_column.data())
368369

369-
@property
370+
@cached_property
370371
def nullable(self) -> bool:
371372
return self.mask is not None
372373

@@ -375,9 +376,14 @@ def has_nulls(self, include_nan: bool = False) -> bool:
375376
376377
NaN inclusion is supported for specific dtypes only.
377378
"""
378-
return int(self.null_count) != 0
379+
try:
380+
return self._has_nulls[include_nan]
381+
except KeyError:
382+
result = int(self.null_count) != 0
383+
self._has_nulls[include_nan] = result
384+
return result
379385

380-
@property
386+
@cached_property
381387
def is_all_null(self) -> bool:
382388
"""Check if all values in the column are null.
383389
@@ -386,7 +392,7 @@ def is_all_null(self) -> bool:
386392
"""
387393
return self.null_count == len(self)
388394

389-
@property
395+
@cached_property
390396
def valid_count(self) -> int:
391397
"""Return the number of non-null values in the column.
392398
@@ -395,7 +401,7 @@ def valid_count(self) -> int:
395401
"""
396402
return len(self) - self.null_count
397403

398-
@property
404+
@cached_property
399405
def mask(self) -> None | Buffer:
400406
"""Get mask buffer from pylibcudf column."""
401407
return cast("Buffer | None", self.plc_column.null_mask())
@@ -425,6 +431,7 @@ def access(self, **kwargs: Any) -> _ColumnAccessContext:
425431

426432
def _clear_cache(self) -> None:
427433
self._distinct_count.clear()
434+
self._has_nulls.clear()
428435
for attr_name in self._CACHED_PROPERTY_NAMES:
429436
try:
430437
delattr(self, attr_name)
@@ -458,11 +465,11 @@ def set_mask(self, mask: Buffer | None, null_count: int) -> Self:
458465
ColumnBase.create(new_plc_column, self.dtype),
459466
)
460467

461-
@property
468+
@cached_property
462469
def null_count(self) -> int:
463470
return self.plc_column.null_count()
464471

465-
@property
472+
@cached_property
466473
def offset(self) -> int:
467474
return self.plc_column.offset()
468475

@@ -800,6 +807,7 @@ def _from_preprocessed(
800807
self.plc_column = plc_column
801808
self._dtype = dtype
802809
self._distinct_count = {}
810+
self._has_nulls = {}
803811
# The set of exposed buffers associated with this column. These buffers must be
804812
# kept alive for the lifetime of this column since anything that accessed the
805813
# CAI of this column will still be pointing to those buffers. As such objects
@@ -1883,7 +1891,6 @@ def as_mask(self) -> tuple[Buffer, int]:
18831891

18841892
@property
18851893
def is_unique(self) -> bool:
1886-
# distinct_count might already be cached
18871894
return self.distinct_count(dropna=False) == len(self)
18881895

18891896
@cached_property

0 commit comments

Comments
 (0)