From 76e3cf38511d92bc3f4792ce530c6ad54752319e Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Tue, 20 Aug 2024 20:35:01 +0200 Subject: [PATCH 1/7] Add fast version of is_hashable --- pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 35 +++++++++++++++++++++++++++ pandas/core/dtypes/inference.py | 43 ++------------------------------- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index daaaacee3487d..71539101860e6 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -48,6 +48,7 @@ def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... def is_integer(obj: object) -> TypeGuard[int | np.integer]: ... def is_int_or_none(obj) -> bool: ... def is_float(obj: object) -> TypeGuard[float]: ... +def is_hashable(obj: object) -> TypeGuard[Hashable]: ... def is_interval_array(values: np.ndarray) -> bool: ... def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ... def is_timedelta_or_timedelta64_array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..34a06a96c06f2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -24,6 +24,7 @@ from cpython.number cimport PyNumber_Check from cpython.object cimport ( Py_EQ, PyObject, + PyObject_Hash, PyObject_RichCompareBool, ) from cpython.ref cimport Py_INCREF @@ -1089,6 +1090,40 @@ def is_float(obj: object) -> bool: return util.is_float_object(obj) +def is_hashable(obj: object) -> bool: + """ + Return True if hash(obj) will succeed, False otherwise. + + Some types will pass a test against collections.abc.Hashable but fail when + they are actually hashed with hash(). + + Distinguish between these and other types by trying the call to hash() and + seeing if they raise TypeError. 
+ + Returns + ------- + bool + + Examples + -------- + >>> import collections + >>> from pandas.api.types import is_hashable + >>> a = ([],) + >>> isinstance(a, collections.abc.Hashable) + True + >>> is_hashable(a) + False + """ + # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable), + # which can be faster than calling hash. That is because numpy scalars + # fail this test. + + # Reconsider this decision once this numpy bug is fixed: + # https://github.com/numpy/numpy/issues/5562 + + return PyObject_Hash(obj) != -1 + + def is_integer(obj: object) -> bool: """ Return True if given object is integer. diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index f042911b53d2b..3006a3303e24d 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -13,8 +13,6 @@ from pandas._libs import lib if TYPE_CHECKING: - from collections.abc import Hashable - from pandas._typing import TypeGuard is_bool = lib.is_bool @@ -23,6 +21,8 @@ is_float = lib.is_float +is_hashable = lib.is_hashable + is_complex = lib.is_complex is_scalar = lib.is_scalar @@ -330,45 +330,6 @@ def is_named_tuple(obj: object) -> bool: return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields") -def is_hashable(obj: object) -> TypeGuard[Hashable]: - """ - Return True if hash(obj) will succeed, False otherwise. - - Some types will pass a test against collections.abc.Hashable but fail when - they are actually hashed with hash(). - - Distinguish between these and other types by trying the call to hash() and - seeing if they raise TypeError. - - Returns - ------- - bool - - Examples - -------- - >>> import collections - >>> from pandas.api.types import is_hashable - >>> a = ([],) - >>> isinstance(a, collections.abc.Hashable) - True - >>> is_hashable(a) - False - """ - # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable), - # which can be faster than calling hash. That is because numpy scalars - # fail this test. 
- - # Reconsider this decision once this numpy bug is fixed: - # https://github.com/numpy/numpy/issues/5562 - - try: - hash(obj) - except TypeError: - return False - else: - return True - - def is_sequence(obj: object) -> bool: """ Check if the object is a sequence of objects. From 8eb6ac41fa7426de9c1a02fdc881851611f87a0b Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Tue, 20 Aug 2024 21:05:47 +0200 Subject: [PATCH 2/7] Catch exceptions --- pandas/_libs/lib.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 34a06a96c06f2..3667981bc1b4f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1121,7 +1121,12 @@ def is_hashable(obj: object) -> bool: # Reconsider this decision once this numpy bug is fixed: # https://github.com/numpy/numpy/issues/5562 - return PyObject_Hash(obj) != -1 + try: + PyObject_Hash(obj) + except TypeError: + return False + else: + return True def is_integer(obj: object) -> bool: From e7e392bb96b61b73a02b7a0a9b3236655e474e63 Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Wed, 21 Aug 2024 21:10:51 +0200 Subject: [PATCH 3/7] Improve performance of is_hashable by checking for common types --- pandas/_libs/lib.pyx | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3667981bc1b4f..419fed6453de5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -19,7 +19,11 @@ from cpython.datetime cimport ( time, timedelta, ) +from cpython.dict cimport PyDict_CheckExact +from cpython.float cimport PyFloat_CheckExact from cpython.iterator cimport PyIter_Check +from cpython.list cimport PyList_CheckExact +from cpython.long cimport PyLong_CheckExact from cpython.number cimport PyNumber_Check from cpython.object cimport ( Py_EQ, @@ -29,10 +33,16 @@ from cpython.object cimport ( ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check +from
cpython.set cimport ( + PyAnySet_CheckExact, + PyFrozenSet_CheckExact, +) from cpython.tuple cimport ( + PyTuple_CheckExact, PyTuple_New, PyTuple_SET_ITEM, ) +from cpython.unicode cimport PyUnicode_CheckExact from cython cimport ( Py_ssize_t, floating, @@ -1090,7 +1100,7 @@ def is_float(obj: object) -> bool: return util.is_float_object(obj) -def is_hashable(obj: object) -> bool: +cpdef bint is_hashable(object obj): """ Return True if hash(obj) will succeed, False otherwise. @@ -1114,12 +1124,26 @@ def is_hashable(obj: object) -> bool: >>> is_hashable(a) False """ - # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable), - # which can be faster than calling hash. That is because numpy scalars - # fail this test. + if ( + PyLong_CheckExact(obj) + or PyFloat_CheckExact(obj) + or PyUnicode_CheckExact(obj) + ): + return True + + # tuple and frozenset is hashable if and only if all elements are hashable + if PyTuple_CheckExact(obj) or PyFrozenSet_CheckExact(obj): + for o in obj: + if not is_hashable(o): + return False + return True - # Reconsider this decision once this numpy bug is fixed: - # https://github.com/numpy/numpy/issues/5562 + if ( + PyDict_CheckExact(obj) + or PyList_CheckExact(obj) + or PyAnySet_CheckExact(obj) + ): + return False try: PyObject_Hash(obj) From 3e4453321aeb8d86d9cb96f278dc105d9eb30767 Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Wed, 21 Aug 2024 22:04:56 +0200 Subject: [PATCH 4/7] Fix docstring check --- ci/code_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 324305417a600..5f991312b7e10 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -154,7 +154,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_float_dtype SA01" \ - -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ + -i "pandas.api.types.is_hashable PR01,SA01" \ -i 
"pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_integer_dtype SA01" \ From 5f79b59498cfdd2191e70e996875109993f32f75 Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Wed, 28 Aug 2024 21:38:16 +0200 Subject: [PATCH 5/7] Clean up is_hashable --- pandas/_libs/lib.pyx | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 419fed6453de5..d9c4241d2c5f9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1124,25 +1124,45 @@ cpdef bint is_hashable(object obj): >>> is_hashable(a) False """ - if ( - PyLong_CheckExact(obj) - or PyFloat_CheckExact(obj) - or PyUnicode_CheckExact(obj) - ): + cdef: + bint is_none + bint is_long + bint is_float + bint is_unicode + bint is_tuple + bint is_frozen_set + bint is_dict + bint is_list + bint is_any_set + + # Perform all checks in order to be nice to the branch predictor + is_none = obj is None + is_long = PyLong_CheckExact(obj) + is_float = PyFloat_CheckExact(obj) + is_unicode = PyUnicode_CheckExact(obj) + is_tuple = PyTuple_CheckExact(obj) + is_frozen_set = PyFrozenSet_CheckExact(obj) + is_dict = PyDict_CheckExact(obj) + is_list = PyList_CheckExact(obj) + is_any_set = PyAnySet_CheckExact(obj) + + if is_none or is_long or is_float or is_unicode: return True # tuple and frozenset is hashable if and only if all elements are hashable - if PyTuple_CheckExact(obj) or PyFrozenSet_CheckExact(obj): - for o in obj: + if is_tuple: + for o in obj: if not is_hashable(o): return False return True - if ( - PyDict_CheckExact(obj) - or PyList_CheckExact(obj) - or PyAnySet_CheckExact(obj) - ): + if is_frozen_set: + for o in obj: + if not is_hashable(o): + return False + return True + + if is_dict or is_list or is_any_set: return False try: From 4e3ff4817a7f3a1ba1c22f92a0e79c23743e8feb Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Sat, 31 Aug 2024 22:19:41 +0200 Subject: [PATCH 
6/7] frozenset is always hashable --- pandas/_libs/lib.pyx | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d9c4241d2c5f9..ba6bedd7e25a8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1146,22 +1146,16 @@ cpdef bint is_hashable(object obj): is_list = PyList_CheckExact(obj) is_any_set = PyAnySet_CheckExact(obj) - if is_none or is_long or is_float or is_unicode: + if is_none or is_long or is_float or is_unicode or is_frozen_set: return True - # tuple and frozenset is hashable if and only if all elements are hashable + # tuple is hashable if and only if all elements are hashable if is_tuple: for o in obj: if not is_hashable(o): return False return True - if is_frozen_set: - for o in obj: - if not is_hashable(o): - return False - return True - if is_dict or is_list or is_any_set: return False From 1f510617febfcf291765734b1f9843119d413650 Mon Sep 17 00:00:00 2001 From: Tolker-KU Date: Sat, 31 Aug 2024 22:23:13 +0200 Subject: [PATCH 7/7] Mark as noexcept for better C codegen --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ba6bedd7e25a8..c2f0c9f86bbfa 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1100,7 +1100,7 @@ def is_float(obj: object) -> bool: return util.is_float_object(obj) -cpdef bint is_hashable(object obj): +cpdef bint is_hashable(object obj) noexcept: """ Return True if hash(obj) will succeed, False otherwise.