Skip to content

Commit 18ccb7e

Browse files
committed
[Python][C++] Support unsigned dictionary indices in pandas conversion
1 parent 6456944 commit 18ccb7e

File tree

2 files changed

+41
-15
lines changed

2 files changed

+41
-15
lines changed

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1862,8 +1862,18 @@ class CategoricalWriter
18621862
}
18631863

18641864
Status WriteIndicesUniform(const ChunkedArray& data) {
1865-
RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
1866-
T* out_values = reinterpret_cast<T*>(this->block_data_);
1865+
// pandas uses -1 as the null code, so unsigned index types are widened to a signed type
1866+
// uint8/uint16 -> int32, uint32 -> int64; signed types are left unchanged
1867+
using OutputType = std::conditional_t<
1868+
std::is_same<T, uint8_t>::value || std::is_same<T, uint16_t>::value, int32_t,
1869+
std::conditional_t<std::is_same<T, uint32_t>::value, int64_t, T>>;
1870+
const int npy_output_type = std::is_same<OutputType, int32_t>::value ? NPY_INT32
1871+
: std::is_same<OutputType, int64_t>::value
1872+
? NPY_INT64
1873+
: TRAITS::npy_type;
1874+
1875+
RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
1876+
auto out_values = reinterpret_cast<OutputType*>(this->block_data_);
18671877

18681878
for (int c = 0; c < data.num_chunks(); c++) {
18691879
const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
@@ -1874,7 +1884,7 @@ class CategoricalWriter
18741884
// Null is -1 in CategoricalBlock
18751885
for (int i = 0; i < arr.length(); ++i) {
18761886
if (indices.IsValid(i)) {
1877-
*out_values++ = values[i];
1887+
*out_values++ = static_cast<OutputType>(values[i]);
18781888
} else {
18791889
*out_values++ = -1;
18801890
}
@@ -1927,7 +1937,11 @@ class CategoricalWriter
19271937
const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
19281938
const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
19291939

1930-
if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
1940+
// For unsigned types, we need to convert to signed for pandas compatibility
1941+
// even when there are no nulls, so we skip the fast path
1942+
const bool is_unsigned = std::is_unsigned<T>::value;
1943+
1944+
if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
19311945
RETURN_NOT_OK(
19321946
CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
19331947

@@ -2023,9 +2037,9 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
20232037
CATEGORICAL_CASE(Int16Type);
20242038
CATEGORICAL_CASE(Int32Type);
20252039
CATEGORICAL_CASE(Int64Type);
2026-
case Type::UINT8:
2027-
case Type::UINT16:
2028-
case Type::UINT32:
2040+
CATEGORICAL_CASE(UInt8Type);
2041+
CATEGORICAL_CASE(UInt16Type);
2042+
CATEGORICAL_CASE(UInt32Type);
20292043
case Type::UINT64:
20302044
return Status::TypeError(
20312045
"Converting unsigned dictionary indices to pandas",

python/pyarrow/tests/test_pandas.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4114,24 +4114,36 @@ def test_dictionary_with_pandas():
41144114
d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
41154115
d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
41164116

4117-
if index_type[0] == 'u':
4118-
# TODO: unsigned dictionary indices to pandas
4119-
with pytest.raises(TypeError):
4117+
if index_type == 'uint64':
4118+
# uint64 is not supported: indices above 2^63-1 would overflow a signed int64 code
4119+
with pytest.raises(TypeError,
4120+
match="Converting unsigned dictionary indices"):
41204121
d1.to_pandas()
41214122
continue
41224123

41234124
pandas1 = d1.to_pandas()
4124-
ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
4125+
# Pandas Categorical uses signed int codes. Arrow converts:
4126+
# uint8/uint16 to int32, uint32 to int64, signed types unchanged
4127+
if index_type in ('uint8', 'uint16'):
4128+
compare_indices = indices.astype('int32')
4129+
elif index_type == 'uint32':
4130+
compare_indices = indices.astype('int64')
4131+
else:
4132+
compare_indices = indices
4133+
ex_pandas1 = pd.Categorical.from_codes(compare_indices, categories=dictionary)
41254134

41264135
tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))
41274136

41284137
pandas2 = d2.to_pandas()
41294138
assert pandas2.isnull().sum() == 1
41304139

4131-
# Unsigned integers converted to signed
4132-
signed_indices = indices
4133-
if index_type[0] == 'u':
4134-
signed_indices = indices.astype(index_type[1:])
4140+
# Use the same conversion as above for the comparison
4141+
if index_type in ('uint8', 'uint16'):
4142+
signed_indices = indices.astype('int32')
4143+
elif index_type == 'uint32':
4144+
signed_indices = indices.astype('int64')
4145+
else:
4146+
signed_indices = indices
41354147
ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1,
41364148
signed_indices),
41374149
categories=dictionary)

0 commit comments

Comments
 (0)