Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1862,8 +1862,21 @@ class CategoricalWriter
}

Status WriteIndicesUniform(const ChunkedArray& data) {
RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
T* out_values = reinterpret_cast<T*>(this->block_data_);
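// pandas categorical codes are signed and use -1 for nulls, so unsigned index
// types are widened to the next larger signed type, which can hold every valid
// index as well as -1: uint8 -> int16, uint16 -> int32, uint32 -> int64.
// Signed index types are left unchanged.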
using OutputType = std::conditional_t<
std::is_same<T, uint8_t>::value, int16_t,
std::conditional_t<
std::is_same<T, uint16_t>::value, int32_t,
std::conditional_t<std::is_same<T, uint32_t>::value, int64_t, T>>>;
const int npy_output_type = std::is_same<OutputType, int16_t>::value ? NPY_INT16
: std::is_same<OutputType, int32_t>::value ? NPY_INT32
: std::is_same<OutputType, int64_t>::value
? NPY_INT64
: TRAITS::npy_type;

RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
auto out_values = reinterpret_cast<OutputType*>(this->block_data_);

for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
Expand All @@ -1874,7 +1887,7 @@ class CategoricalWriter
// Null is -1 in CategoricalBlock
for (int i = 0; i < arr.length(); ++i) {
if (indices.IsValid(i)) {
*out_values++ = values[i];
*out_values++ = static_cast<OutputType>(values[i]);
} else {
*out_values++ = -1;
}
Expand Down Expand Up @@ -1927,7 +1940,11 @@ class CategoricalWriter
const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());

if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
// For unsigned types, we need to convert to signed for pandas compatibility
// even when there are no nulls, so we skip the fast path
const bool is_unsigned = std::is_unsigned<T>::value;

if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
RETURN_NOT_OK(
CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));

Expand Down Expand Up @@ -2023,13 +2040,12 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
CATEGORICAL_CASE(Int16Type);
CATEGORICAL_CASE(Int32Type);
CATEGORICAL_CASE(Int64Type);
case Type::UINT8:
case Type::UINT16:
case Type::UINT32:
CATEGORICAL_CASE(UInt8Type);
CATEGORICAL_CASE(UInt16Type);
CATEGORICAL_CASE(UInt32Type);
case Type::UINT64:
return Status::TypeError(
"Converting unsigned dictionary indices to pandas",
" not yet supported, index type: ", index_type.ToString());
"Converting UInt64 dictionary indices to pandas is not supported.");
default:
// Unreachable
ARROW_DCHECK(false);
Expand Down
32 changes: 24 additions & 8 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4114,24 +4114,40 @@ def test_dictionary_with_pandas():
d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)

if index_type[0] == 'u':
# TODO: unsigned dictionary indices to pandas
with pytest.raises(TypeError):
if index_type == 'uint64':
# uint64 is not supported due to overflow risk (values > 2^63-1)
with pytest.raises(TypeError,
match="UInt64 dictionary indices"):
d1.to_pandas()
continue

pandas1 = d1.to_pandas()
ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
# Pandas Categorical uses signed int codes. Arrow converts:
# uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
if index_type == 'uint8':
compare_indices = indices.astype('int16')
elif index_type == 'uint16':
compare_indices = indices.astype('int32')
elif index_type == 'uint32':
compare_indices = indices.astype('int64')
else:
compare_indices = indices
ex_pandas1 = pd.Categorical.from_codes(compare_indices, categories=dictionary)

tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))

pandas2 = d2.to_pandas()
assert pandas2.isnull().sum() == 1

# Unsigned integers converted to signed
signed_indices = indices
if index_type[0] == 'u':
signed_indices = indices.astype(index_type[1:])
# Use same conversion as above for comparison
if index_type == 'uint8':
signed_indices = indices.astype('int16')
elif index_type == 'uint16':
signed_indices = indices.astype('int32')
elif index_type == 'uint32':
signed_indices = indices.astype('int64')
else:
signed_indices = indices
ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1,
signed_indices),
categories=dictionary)
Expand Down
Loading