Skip to content

Commit 0e08f57

Browse files
committed
[Python][C++] Support unsigned dictionary indices in pandas conversion
1 parent 6456944 commit 0e08f57

File tree

2 files changed

+29
-21
lines changed

2 files changed

+29
-21
lines changed

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1862,8 +1862,13 @@ class CategoricalWriter
18621862
}
18631863

18641864
Status WriteIndicesUniform(const ChunkedArray& data) {
1865-
RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
1866-
T* out_values = reinterpret_cast<T*>(this->block_data_);
1865+
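// Unsigned index types cannot represent the -1 null sentinel that pandas uses,
// so the codes are written out as int32 instead.
// NOTE(review): uint32/uint64 indices above INT32_MAX do not fit in int32 --
// confirm the indices are bounds-checked before the cast below.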
1866+
const bool is_unsigned = std::is_unsigned<T>::value;
1867+
using OutputType = std::conditional_t<is_unsigned, int32_t, T>;
1868+
const int npy_output_type = is_unsigned ? NPY_INT32 : TRAITS::npy_type;
1869+
1870+
RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
1871+
auto out_values = reinterpret_cast<OutputType*>(this->block_data_);
18671872

18681873
for (int c = 0; c < data.num_chunks(); c++) {
18691874
const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
@@ -1874,7 +1879,7 @@ class CategoricalWriter
18741879
// Null is -1 in CategoricalBlock
18751880
for (int i = 0; i < arr.length(); ++i) {
18761881
if (indices.IsValid(i)) {
1877-
*out_values++ = values[i];
1882+
*out_values++ = static_cast<OutputType>(values[i]);
18781883
} else {
18791884
*out_values++ = -1;
18801885
}
@@ -1927,7 +1932,11 @@ class CategoricalWriter
19271932
const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
19281933
const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
19291934

1930-
if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
1935+
// For unsigned types, we need to convert to signed for pandas compatibility
1936+
// even when there are no nulls, so we skip the fast path
1937+
const bool is_unsigned = std::is_unsigned<T>::value;
1938+
1939+
if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
19311940
RETURN_NOT_OK(
19321941
CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
19331942

@@ -2023,13 +2032,10 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
20232032
CATEGORICAL_CASE(Int16Type);
20242033
CATEGORICAL_CASE(Int32Type);
20252034
CATEGORICAL_CASE(Int64Type);
2026-
case Type::UINT8:
2027-
case Type::UINT16:
2028-
case Type::UINT32:
2029-
case Type::UINT64:
2030-
return Status::TypeError(
2031-
"Converting unsigned dictionary indices to pandas",
2032-
" not yet supported, index type: ", index_type.ToString());
2035+
CATEGORICAL_CASE(UInt8Type);
2036+
CATEGORICAL_CASE(UInt16Type);
2037+
CATEGORICAL_CASE(UInt32Type);
2038+
CATEGORICAL_CASE(UInt64Type);
20332039
default:
20342040
// Unreachable
20352041
ARROW_DCHECK(false);

python/pyarrow/tests/test_pandas.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4114,24 +4114,26 @@ def test_dictionary_with_pandas():
41144114
d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
41154115
d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
41164116

4117-
if index_type[0] == 'u':
4118-
# TODO: unsigned dictionary indices to pandas
4119-
with pytest.raises(TypeError):
4120-
d1.to_pandas()
4121-
continue
4122-
41234117
pandas1 = d1.to_pandas()
4124-
ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
4118+
# pandas Categorical stores its codes as signed integers.
4119+
# For unsigned inputs, Arrow converts to signed int32 internally.
4120+
if index_type[0] == 'u':
4121+
# Unsigned indices are converted to int32
4122+
ex_pandas1 = pd.Categorical.from_codes(
4123+
indices.astype('int32'), categories=dictionary)
4124+
else:
4125+
ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
41254126

41264127
tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))
41274128

41284129
pandas2 = d2.to_pandas()
41294130
assert pandas2.isnull().sum() == 1
41304131

4131-
# Unsigned integers converted to signed
4132-
signed_indices = indices
4132+
# Unsigned integers converted to int32, signed integers keep their type
41334133
if index_type[0] == 'u':
4134-
signed_indices = indices.astype(index_type[1:])
4134+
signed_indices = indices.astype('int32')
4135+
else:
4136+
signed_indices = indices
41354137
ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1,
41364138
signed_indices),
41374139
categories=dictionary)

0 commit comments

Comments
 (0)