Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.api.types.is_file_like PR07,SA01" \
-i "pandas.api.types.is_float PR01,SA01" \
-i "pandas.api.types.is_float_dtype SA01" \
-i "pandas.api.types.is_hashable PR01,RT03,SA01" \
-i "pandas.api.types.is_hashable PR01,SA01" \
-i "pandas.api.types.is_int64_dtype SA01" \
-i "pandas.api.types.is_integer PR01,SA01" \
-i "pandas.api.types.is_integer_dtype SA01" \
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ...
def is_integer(obj: object) -> TypeGuard[int | np.integer]: ...
def is_int_or_none(obj) -> bool: ...
def is_float(obj: object) -> TypeGuard[float]: ...
def is_hashable(obj: object) -> TypeGuard[Hashable]: ...
def is_interval_array(values: np.ndarray) -> bool: ...
def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ...
def is_timedelta_or_timedelta64_array(
Expand Down
78 changes: 78 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,30 @@ from cpython.datetime cimport (
time,
timedelta,
)
from cpython.dict cimport PyDict_CheckExact
from cpython.float cimport PyFloat_CheckExact
from cpython.iterator cimport PyIter_Check
from cpython.list cimport PyList_CheckExact
from cpython.long cimport PyLong_CheckExact
from cpython.number cimport PyNumber_Check
from cpython.object cimport (
Py_EQ,
PyObject,
PyObject_Hash,
PyObject_RichCompareBool,
)
from cpython.ref cimport Py_INCREF
from cpython.sequence cimport PySequence_Check
from cpython.set cimport (
PyAnySet_CheckExact,
PyFrozenSet_CheckExact,
)
from cpython.tuple cimport (
PyTuple_CheckExact,
PyTuple_New,
PyTuple_SET_ITEM,
)
from cpython.unicode cimport PyUnicode_CheckExact
from cython cimport (
Py_ssize_t,
floating,
Expand Down Expand Up @@ -1089,6 +1100,73 @@ def is_float(obj: object) -> bool:
return util.is_float_object(obj)


cpdef bint is_hashable(object obj) noexcept:
"""
Return True if hash(obj) will succeed, False otherwise.

Some types will pass a test against collections.abc.Hashable but fail when
they are actually hashed with hash().

Distinguish between these and other types by trying the call to hash() and
seeing if they raise TypeError.

Returns
-------
bool

Examples
--------
>>> import collections
>>> from pandas.api.types import is_hashable
>>> a = ([],)
>>> isinstance(a, collections.abc.Hashable)
True
>>> is_hashable(a)
False
"""
cdef:
bint is_none
bint is_long
bint is_float
bint is_unicode
bint is_tuple
bint is_frozen_set
bint is_dict
bint is_list
bint is_any_set

# Perform all checks in order to be nice to the branch predictor
is_none = obj is None
is_long = PyLong_CheckExact(obj)
is_float = PyFloat_CheckExact(obj)
is_unicode = PyUnicode_CheckExact(obj)
is_tuple = PyTuple_CheckExact(obj)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any particular reason to use CheckExact over Check for the container types? I think the latter is preferable, given subclasses of these aren't that uncommon

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The motivation for this is pandas' own FrozenList which is hashable and a subclass of list. I agree that PyXXXX_Check(...) would be more intuitive but due to corner cases like FrozenList I think we need the exact checks to maintain the correct behaviour

is_frozen_set = PyFrozenSet_CheckExact(obj)
is_dict = PyDict_CheckExact(obj)
is_list = PyList_CheckExact(obj)
is_any_set = PyAnySet_CheckExact(obj)

if is_none or is_long or is_float or is_unicode or is_frozen_set:
return True

# tuple is hashable if and only if all elements are hashable
if is_tuple:
for o in <tuple>obj:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the cast actually matter? I again would think Cython would just always defer to PySequence_GetItem for iteration.

If that's true, I suppose there could be some difference between PySequence_GetItem and a more exact PyTuple_GETITEM call that could be dispatched to, with the latter not having to deal with reference counting. However, the point of writing in Cython is to happily trade some performance compared to the raw C code for some higher level abstractions.

Where that line really matters is up for debate, but I generally worry about trying to be too exact in this function. I'm also a little unclear why the non-hashable items can't short-circuit to quickly throw an error during a PyObject_Hash call. I would think the Python built-in tuple and frozen set should be able to do this faster than us?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

I've looked into the generated C-code for this version with the casts and a version without the cast. Assuming that the C-compiler is able to hoist some conditions out of the generated loops, the generated assembly for the two versions should be exactly the same. I went too far down the path of micro optimisations.

As to whether the built-in tuple and frozenset can do this faster than us I've looked into the implementations in CPython. tuple essentially call the hash function on all the elements in a loop and do some bit-twiddling to combine the hashes into one hash. The C-code of that loop is very similar to that generated by cython but we save calls to PyObject_Hash(...) on the elements which on the micro level should give a nice speed-up. For frozenset I've discover that it always is hashable, which makes perfect sense as elements must hashable to be put in a set. This discovery allows me to make some simplifications to the code.

Generated C-code

With casts

image
image

Without casts

image
CPython implementation

tuple

https://github.com/python/cpython/blob/0cba289870d5cd41f24b2f63b9480e4593aa2330/Objects/tupleobject.c#L318-L342

Note PyTuple_GET_ITEM(...) is essentially just indexing into the underlying array
https://github.com/python/cpython/blob/0cba289870d5cd41f24b2f63b9480e4593aa2330/Include/cpython/tupleobject.h#L27

if not is_hashable(o):
return False
return True

if is_dict or is_list or is_any_set:
return False

try:
PyObject_Hash(obj)
except TypeError:
return False
else:
return True


def is_integer(obj: object) -> bool:
"""
Return True if given object is integer.
Expand Down
43 changes: 2 additions & 41 deletions pandas/core/dtypes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
from pandas._libs import lib

if TYPE_CHECKING:
from collections.abc import Hashable

from pandas._typing import TypeGuard

is_bool = lib.is_bool
Expand All @@ -23,6 +21,8 @@

is_float = lib.is_float

is_hashable = lib.is_hashable

is_complex = lib.is_complex

is_scalar = lib.is_scalar
Expand Down Expand Up @@ -330,45 +330,6 @@ def is_named_tuple(obj: object) -> bool:
return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields")


def is_hashable(obj: object) -> TypeGuard[Hashable]:
"""
Return True if hash(obj) will succeed, False otherwise.

Some types will pass a test against collections.abc.Hashable but fail when
they are actually hashed with hash().

Distinguish between these and other types by trying the call to hash() and
seeing if they raise TypeError.

Returns
-------
bool

Examples
--------
>>> import collections
>>> from pandas.api.types import is_hashable
>>> a = ([],)
>>> isinstance(a, collections.abc.Hashable)
True
>>> is_hashable(a)
False
"""
# Unfortunately, we can't use isinstance(obj, collections.abc.Hashable),
# which can be faster than calling hash. That is because numpy scalars
# fail this test.

# Reconsider this decision once this numpy bug is fixed:
# https://github.com/numpy/numpy/issues/5562

try:
hash(obj)
except TypeError:
return False
else:
return True


def is_sequence(obj: object) -> bool:
"""
Check if the object is a sequence of objects.
Expand Down