From 71ae658795f5dd5930799756c7e93e35d8217f7e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 18:06:40 +0200 Subject: [PATCH 1/5] adding test cases for PyObject with many nans --- pandas/tests/libs/test_hashtable.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 0edcebdc069f4..9b1813962bb69 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -339,6 +339,29 @@ def test_unique(self, table_type, dtype): assert np.all(np.isnan(unique)) and len(unique) == 1 +def test_unique_for_nan_objects_floats(): + table = ht.PyObjectHashTable() + keys = np.array([float("nan") for i in range(50)], dtype=np.object_) + unique = table.unique(keys) + assert len(unique) == 1 + + +def test_unique_for_nan_objects_complex(): + table = ht.PyObjectHashTable() + keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_) + unique = table.unique(keys) + assert len(unique) == 1 + + +def test_unique_for_nan_objects_tuple(): + table = ht.PyObjectHashTable() + keys = np.array( + [1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_ + ) + unique = table.unique(keys) + assert len(unique) == 2 + + def get_ht_function(fun_name, type_suffix): return getattr(ht, fun_name) From 152c0008ddcbfe1beeaf0a279e506da576417e1f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 21:49:05 +0200 Subject: [PATCH 2/5] fix hash for floats --- pandas/_libs/src/klib/khash_python.h | 34 +++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 87c6283c19a2f..1c37a6dd1e4b2 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -251,12 +251,40 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { } +Py_hash_t PANDAS_INLINE 
_Pandas_HashDouble(double val){ + //Since Python3.10, nan no longer has hash 0 + if (Py_IS_NAN(val)) { + return 0; + } +#if PY_VERSION_HEX < 0x030A0000 + return _Py_HashDouble(val); +#else + return _Py_HashDouble(NULL, val); +#endif +} + + +Py_hash_t PANDAS_INLINE hash_float(PyFloatObject* key){ + return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); +} + + khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ + Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) - // hash(X) == 0 if X is a NaN-value - // so it is OK to use it directly for doubles - Py_hash_t hash = PyObject_Hash(key); + // yet for different nan-object different hash-values + // are possible + if (PyFloat_CheckExact(key)) { + // we cannot use kh_float64_hash_func + // because float(k) == k holds for any int-object k + // and kh_float64_hash_func doesn't respect it + hash = hash_float((PyFloatObject*)key); + } + else { + hash = PyObject_Hash(key); + } + if (hash == -1) { PyErr_Clear(); return 0; From 1c10229db2965d50d28ba78b57e708ba09e455b0 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 21:54:23 +0200 Subject: [PATCH 3/5] adding test for 5 == 5.0 == 5+0j --- pandas/tests/libs/test_hashtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 9b1813962bb69..a1a43fa6ef300 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -520,3 +520,11 @@ def test_ismember_tuple_with_nans(): result = isin(values, comps) expected = np.array([True, False], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) + + +def test_float_complex_int_are_equal_as_objects(): + values = ["a", 5, 5.0, 5.0 + 0j] + comps = list(range(129)) + result = isin(values, comps) + expected = np.array([False, True, True, True], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) From fdbfdecff96dc566f0acb48dcec01f574c874090 Mon Sep 17 
00:00:00 2001 From: Egor Dranischnikow Date: Sat, 12 Jun 2021 07:27:09 +0200 Subject: [PATCH 4/5] hash function for complex --- pandas/_libs/src/klib/khash_python.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 1c37a6dd1e4b2..412edbc74d7d1 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -264,11 +264,26 @@ Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){ } -Py_hash_t PANDAS_INLINE hash_float(PyFloatObject* key){ +Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key){ return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); } +// replaces _Py_HashDouble with _Pandas_HashDouble +Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){ + Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); + Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); + if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { + return -1; + } + Py_uhash_t combined = realhash + _PyHASH_IMAG * imaghash; + if (combined == (Py_uhash_t)-1) { + return -2; + } + return (Py_hash_t)combined; +} + + khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ Py_hash_t hash; // For PyObject_Hash holds: @@ -279,7 +294,13 @@ khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ // we cannot use kh_float64_hash_func // because float(k) == k holds for any int-object k // and kh_float64_hash_func doesn't respect it - hash = hash_float((PyFloatObject*)key); + hash = floatobject_hash((PyFloatObject*)key); + } + else if (PyComplex_CheckExact(key)) { + // we cannot use kh_complex128_hash_func + // because complex(k,0) == k holds for any int-object k + // and kh_complex128_hash_func doesn't respect it + hash = complexobject_hash((PyComplexObject*)key); } else { hash = PyObject_Hash(key); From e455feda8b24ef336f3f8ff63d6037169da92ab2 Mon Sep 17 00:00:00 2001 From: Egor 
Dranischnikow Date: Sun, 13 Jun 2021 14:23:03 +0200 Subject: [PATCH 5/5] hash function for tuples --- pandas/_libs/src/klib/khash_python.h | 54 +++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 412edbc74d7d1..c8e1ca5ebb4d3 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -251,7 +251,7 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { } -Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){ +Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) { //Since Python3.10, nan no longer has hash 0 if (Py_IS_NAN(val)) { return 0; @@ -264,13 +264,13 @@ Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){ } -Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key){ +Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) { return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); } // replaces _Py_HashDouble with _Pandas_HashDouble -Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){ +Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { @@ -284,11 +284,52 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){ } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); + +//we could use any hashing algorithm, this is the original CPython's for tuples + +#if SIZEOF_PY_UHASH_T > 4 +#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) +#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) +#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) +#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#else +#define 
_PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) +#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) +#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) +#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#endif + +Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { + Py_ssize_t i, len = Py_SIZE(key); + PyObject **item = key->ob_item; + + Py_uhash_t acc = _PandasHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = kh_python_hash_func(item[i]); + if (lane == (Py_uhash_t)-1) { + return -1; + } + acc += lane * _PandasHASH_XXPRIME_2; + acc = _PandasHASH_XXROTATE(acc); + acc *= _PandasHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). */ + acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; + } + return acc; +} + + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) - // yet for different nan-object different hash-values + // yet for different nan-objects different hash-values // are possible if (PyFloat_CheckExact(key)) { // we cannot use kh_float64_hash_func @@ -302,6 +343,9 @@ khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ // and kh_complex128_hash_func doesn't respect it hash = complexobject_hash((PyComplexObject*)key); } + else if (PyTuple_CheckExact(key)) { + hash = tupleobject_hash((PyTupleObject*)key); + } else { hash = PyObject_Hash(key); }