Skip to content

Commit 27a89f3

Browse files
committed
[Python][C++] Support unsigned dictionary indices in pandas conversion
1 parent 6456944 commit 27a89f3

File tree

2 files changed

+47
-15
lines changed

2 files changed

+47
-15
lines changed

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

Lines changed: 23 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1862,8 +1862,20 @@ class CategoricalWriter
18621862
}
18631863

18641864
Status WriteIndicesUniform(const ChunkedArray& data) {
1865-
RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
1866-
T* out_values = reinterpret_cast<T*>(this->block_data_);
1865+
// For unsigned types, upcast to signed since pandas uses -1 for nulls
1866+
// uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
1867+
using OutputType = std::conditional_t<
1868+
std::is_same<T, uint8_t>::value, int16_t,
1869+
std::conditional_t<std::is_same<T, uint16_t>::value, int32_t,
1870+
std::conditional_t<std::is_same<T, uint32_t>::value, int64_t, T>>>;
1871+
const int npy_output_type = std::is_same<OutputType, int16_t>::value ? NPY_INT16
1872+
: std::is_same<OutputType, int32_t>::value ? NPY_INT32
1873+
: std::is_same<OutputType, int64_t>::value
1874+
? NPY_INT64
1875+
: TRAITS::npy_type;
1876+
1877+
RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
1878+
auto out_values = reinterpret_cast<OutputType*>(this->block_data_);
18671879

18681880
for (int c = 0; c < data.num_chunks(); c++) {
18691881
const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
@@ -1874,7 +1886,7 @@ class CategoricalWriter
18741886
// Null is -1 in CategoricalBlock
18751887
for (int i = 0; i < arr.length(); ++i) {
18761888
if (indices.IsValid(i)) {
1877-
*out_values++ = values[i];
1889+
*out_values++ = static_cast<OutputType>(values[i]);
18781890
} else {
18791891
*out_values++ = -1;
18801892
}
@@ -1927,7 +1939,11 @@ class CategoricalWriter
19271939
const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
19281940
const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
19291941

1930-
if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
1942+
// For unsigned types, we need to convert to signed for pandas compatibility
1943+
// even when there are no nulls, so we skip the fast path
1944+
const bool is_unsigned = std::is_unsigned<T>::value;
1945+
1946+
if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
19311947
RETURN_NOT_OK(
19321948
CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
19331949

@@ -2023,9 +2039,9 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
20232039
CATEGORICAL_CASE(Int16Type);
20242040
CATEGORICAL_CASE(Int32Type);
20252041
CATEGORICAL_CASE(Int64Type);
2026-
case Type::UINT8:
2027-
case Type::UINT16:
2028-
case Type::UINT32:
2042+
CATEGORICAL_CASE(UInt8Type);
2043+
CATEGORICAL_CASE(UInt16Type);
2044+
CATEGORICAL_CASE(UInt32Type);
20292045
case Type::UINT64:
20302046
return Status::TypeError(
20312047
"Converting unsigned dictionary indices to pandas",

python/pyarrow/tests/test_pandas.py

Lines changed: 24 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4114,24 +4114,40 @@ def test_dictionary_with_pandas():
41144114
d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
41154115
d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
41164116

4117-
if index_type[0] == 'u':
4118-
# TODO: unsigned dictionary indices to pandas
4119-
with pytest.raises(TypeError):
4117+
if index_type == 'uint64':
4118+
# uint64 is not supported due to overflow risk (values > 2^63-1)
4119+
with pytest.raises(TypeError,
4120+
match="Converting unsigned dictionary indices"):
41204121
d1.to_pandas()
41214122
continue
41224123

41234124
pandas1 = d1.to_pandas()
4124-
ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
4125+
# Pandas Categorical uses signed int codes. Arrow converts:
4126+
# uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
4127+
if index_type == 'uint8':
4128+
compare_indices = indices.astype('int16')
4129+
elif index_type == 'uint16':
4130+
compare_indices = indices.astype('int32')
4131+
elif index_type == 'uint32':
4132+
compare_indices = indices.astype('int64')
4133+
else:
4134+
compare_indices = indices
4135+
ex_pandas1 = pd.Categorical.from_codes(compare_indices, categories=dictionary)
41254136

41264137
tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))
41274138

41284139
pandas2 = d2.to_pandas()
41294140
assert pandas2.isnull().sum() == 1
41304141

4131-
# Unsigned integers converted to signed
4132-
signed_indices = indices
4133-
if index_type[0] == 'u':
4134-
signed_indices = indices.astype(index_type[1:])
4142+
# Use same conversion as above for comparison
4143+
if index_type == 'uint8':
4144+
signed_indices = indices.astype('int16')
4145+
elif index_type == 'uint16':
4146+
signed_indices = indices.astype('int32')
4147+
elif index_type == 'uint32':
4148+
signed_indices = indices.astype('int64')
4149+
else:
4150+
signed_indices = indices
41354151
ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1,
41364152
signed_indices),
41374153
categories=dictionary)

0 commit comments

Comments
 (0)