Skip to content

Commit de6eb89

Browse files
authored
GH-47022: [Python] Support unsigned dictionary indices in pandas conversion (#48451)
### Rationale for this change

This resolves the ticket mentioned in #7659 by implementing support for unsigned dictionary indices in pandas conversion.

### What changes are included in this PR?

Implements unsigned dictionary indices by upcasting them to signed integers in pandas conversion (uint8 to int16, uint16 to int32, uint32 to int64).

### Are these changes tested?

Yes, via:

```
pytest -xvs python/pyarrow/tests/test_pandas.py::test_dictionary_with_pandas
```

### Are there any user-facing changes?

Yes. Converting a dictionary array with `uint8`, `uint16`, or `uint32` indices to pandas now works and yields the same result as `pd.Categorical.from_codes(indices, categories=dictionary)` with the indices upcast to the corresponding signed type, as demonstrated in the tests. `uint64` indices are still unsupported and raise `TypeError`.

* GitHub Issue: #47022

Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Adam Reeve <[email protected]>
1 parent 9c4faee commit de6eb89

File tree

2 files changed

+49
-17
lines changed

2 files changed

+49
-17
lines changed

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,8 +1863,21 @@ class CategoricalWriter
18631863
}
18641864

18651865
Status WriteIndicesUniform(const ChunkedArray& data) {
1866-
RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
1867-
T* out_values = reinterpret_cast<T*>(this->block_data_);
1866+
// For unsigned types, upcast to signed since pandas uses -1 for nulls
1867+
// uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
1868+
using OutputType = std::conditional_t<
1869+
std::is_same<T, uint8_t>::value, int16_t,
1870+
std::conditional_t<
1871+
std::is_same<T, uint16_t>::value, int32_t,
1872+
std::conditional_t<std::is_same<T, uint32_t>::value, int64_t, T>>>;
1873+
const int npy_output_type = std::is_same<OutputType, int16_t>::value ? NPY_INT16
1874+
: std::is_same<OutputType, int32_t>::value ? NPY_INT32
1875+
: std::is_same<OutputType, int64_t>::value
1876+
? NPY_INT64
1877+
: TRAITS::npy_type;
1878+
1879+
RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
1880+
auto out_values = reinterpret_cast<OutputType*>(this->block_data_);
18681881

18691882
for (int c = 0; c < data.num_chunks(); c++) {
18701883
const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
@@ -1875,7 +1888,7 @@ class CategoricalWriter
18751888
// Null is -1 in CategoricalBlock
18761889
for (int i = 0; i < arr.length(); ++i) {
18771890
if (indices.IsValid(i)) {
1878-
*out_values++ = values[i];
1891+
*out_values++ = static_cast<OutputType>(values[i]);
18791892
} else {
18801893
*out_values++ = -1;
18811894
}
@@ -1928,7 +1941,11 @@ class CategoricalWriter
19281941
const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
19291942
const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
19301943

1931-
if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
1944+
// For unsigned types, we need to convert to signed for pandas compatibility
1945+
// even when there are no nulls, so we skip the fast path
1946+
const bool is_unsigned = std::is_unsigned<T>::value;
1947+
1948+
if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
19321949
RETURN_NOT_OK(
19331950
CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
19341951

@@ -2024,13 +2041,12 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
20242041
CATEGORICAL_CASE(Int16Type);
20252042
CATEGORICAL_CASE(Int32Type);
20262043
CATEGORICAL_CASE(Int64Type);
2027-
case Type::UINT8:
2028-
case Type::UINT16:
2029-
case Type::UINT32:
2044+
CATEGORICAL_CASE(UInt8Type);
2045+
CATEGORICAL_CASE(UInt16Type);
2046+
CATEGORICAL_CASE(UInt32Type);
20302047
case Type::UINT64:
20312048
return Status::TypeError(
2032-
"Converting unsigned dictionary indices to pandas",
2033-
" not yet supported, index type: ", index_type.ToString());
2049+
"Converting UInt64 dictionary indices to pandas is not supported.");
20342050
default:
20352051
// Unreachable
20362052
ARROW_DCHECK(false);

python/pyarrow/tests/test_pandas.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4114,24 +4114,40 @@ def test_dictionary_with_pandas():
41144114
d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
41154115
d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
41164116

4117-
if index_type[0] == 'u':
4118-
# TODO: unsigned dictionary indices to pandas
4119-
with pytest.raises(TypeError):
4117+
if index_type == 'uint64':
4118+
# uint64 is not supported due to overflow risk (values > 2^63-1)
4119+
with pytest.raises(TypeError,
4120+
match="UInt64 dictionary indices"):
41204121
d1.to_pandas()
41214122
continue
41224123

41234124
pandas1 = d1.to_pandas()
4124-
ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
4125+
# Pandas Categorical uses signed int codes. Arrow converts:
4126+
# uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
4127+
if index_type == 'uint8':
4128+
compare_indices = indices.astype('int16')
4129+
elif index_type == 'uint16':
4130+
compare_indices = indices.astype('int32')
4131+
elif index_type == 'uint32':
4132+
compare_indices = indices.astype('int64')
4133+
else:
4134+
compare_indices = indices
4135+
ex_pandas1 = pd.Categorical.from_codes(compare_indices, categories=dictionary)
41254136

41264137
tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))
41274138

41284139
pandas2 = d2.to_pandas()
41294140
assert pandas2.isnull().sum() == 1
41304141

4131-
# Unsigned integers converted to signed
4132-
signed_indices = indices
4133-
if index_type[0] == 'u':
4134-
signed_indices = indices.astype(index_type[1:])
4142+
# Use same conversion as above for comparison
4143+
if index_type == 'uint8':
4144+
signed_indices = indices.astype('int16')
4145+
elif index_type == 'uint16':
4146+
signed_indices = indices.astype('int32')
4147+
elif index_type == 'uint32':
4148+
signed_indices = indices.astype('int64')
4149+
else:
4150+
signed_indices = indices
41354151
ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1,
41364152
signed_indices),
41374153
categories=dictionary)

0 commit comments

Comments
 (0)