From 7e384a200d9956cd1f596e4e57ee225b061d883a Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 11 Dec 2025 14:04:43 +0900 Subject: [PATCH 1/3] [Python][C++] Support unsigned dictionary indices in pandas conversion --- .../src/arrow/python/arrow_to_pandas.cc | 31 ++++++++++++++---- python/pyarrow/tests/test_pandas.py | 32 ++++++++++++++----- 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index ed4f394362a..b442c92a93b 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1862,8 +1862,21 @@ class CategoricalWriter } Status WriteIndicesUniform(const ChunkedArray& data) { - RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1)); - T* out_values = reinterpret_cast(this->block_data_); + // For unsigned types, upcast to signed since pandas uses -1 for nulls + // uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged + using OutputType = std::conditional_t< + std::is_same::value, int16_t, + std::conditional_t< + std::is_same::value, int32_t, + std::conditional_t::value, int64_t, T>>>; + const int npy_output_type = std::is_same::value ? NPY_INT16 + : std::is_same::value ? NPY_INT32 + : std::is_same::value + ? NPY_INT64 + : TRAITS::npy_type; + + RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1)); + auto out_values = reinterpret_cast(this->block_data_); for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -1874,7 +1887,7 @@ class CategoricalWriter // Null is -1 in CategoricalBlock for (int i = 0; i < arr.length(); ++i) { if (indices.IsValid(i)) { - *out_values++ = values[i]; + *out_values++ = static_cast(values[i]); } else { *out_values++ = -1; } @@ -1927,7 +1940,11 @@ class CategoricalWriter const auto& arr_first = checked_cast(*data.chunk(0)); const auto indices_first = std::static_pointer_cast(arr_first.indices()); - if (data.num_chunks() == 1 && indices_first->null_count() == 0) { + // For unsigned types, we need to convert to signed for pandas compatibility + // even when there are no nulls, so we skip the fast path + const bool is_unsigned = std::is_unsigned::value; + + if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) { RETURN_NOT_OK( CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length())); @@ -2023,9 +2040,9 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, CATEGORICAL_CASE(Int16Type); CATEGORICAL_CASE(Int32Type); CATEGORICAL_CASE(Int64Type); - case Type::UINT8: - case Type::UINT16: - case Type::UINT32: + CATEGORICAL_CASE(UInt8Type); + CATEGORICAL_CASE(UInt16Type); + CATEGORICAL_CASE(UInt32Type); case Type::UINT64: return Status::TypeError( "Converting unsigned dictionary indices to pandas", diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 7f9b04eaabd..24afdb18c5d 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4114,24 +4114,40 @@ def test_dictionary_with_pandas(): d1 = pa.DictionaryArray.from_arrays(indices, dictionary) d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask) - if index_type[0] == 'u': - # TODO: unsigned dictionary indices to pandas - with pytest.raises(TypeError): + if index_type == 'uint64': + # uint64 is not supported due to overflow risk (values > 2^63-1) + with pytest.raises(TypeError, + match="Converting unsigned dictionary indices"): d1.to_pandas() continue pandas1 = d1.to_pandas() - ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary) + # Pandas Categorical uses signed int codes. Arrow converts: + # uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged + if index_type == 'uint8': + compare_indices = indices.astype('int16') + elif index_type == 'uint16': + compare_indices = indices.astype('int32') + elif index_type == 'uint32': + compare_indices = indices.astype('int64') + else: + compare_indices = indices + ex_pandas1 = pd.Categorical.from_codes(compare_indices, categories=dictionary) tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1)) pandas2 = d2.to_pandas() assert pandas2.isnull().sum() == 1 - # Unsigned integers converted to signed - signed_indices = indices - if index_type[0] == 'u': - signed_indices = indices.astype(index_type[1:]) + # Use same conversion as above for comparison + if index_type == 'uint8': + signed_indices = indices.astype('int16') + elif index_type == 'uint16': + signed_indices = indices.astype('int32') + elif index_type == 'uint32': + signed_indices = indices.astype('int64') + else: + signed_indices = indices ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1, signed_indices), categories=dictionary) From 1c08e54ca9e8b1aeaa855cacf076f7512a280b43 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 19 Dec 2025 16:40:34 +0900 Subject: [PATCH 2/3] Address a comment --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index b442c92a93b..d8653595dfd 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -2045,8 +2045,7 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, CATEGORICAL_CASE(UInt32Type); case Type::UINT64: return Status::TypeError( - "Converting unsigned dictionary indices to pandas", - " not yet supported, index type: ", index_type.ToString()); + "Converting UInt64 dictionary indices to pandas is not supported."); default: // Unreachable ARROW_DCHECK(false); From 4fe575e69fd6910061ba16fc1719d6115f3fcd51 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 23 Dec 2025 09:48:46 +0900 Subject: [PATCH 3/3] fix the test together --- python/pyarrow/tests/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 24afdb18c5d..4bcee62c37a 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4117,7 +4117,7 @@ def test_dictionary_with_pandas(): if index_type == 'uint64': # uint64 is not supported due to overflow risk (values > 2^63-1) with pytest.raises(TypeError, - match="Converting unsigned dictionary indices"): + match="UInt64 dictionary indices"): d1.to_pandas() continue