@@ -1862,8 +1862,20 @@ class CategoricalWriter
18621862 }
18631863
18641864 Status WriteIndicesUniform (const ChunkedArray& data) {
1865- RETURN_NOT_OK (this ->AllocateNDArray (TRAITS::npy_type, 1 ));
1866- T* out_values = reinterpret_cast <T*>(this ->block_data_ );
1865+ // For unsigned types, upcast to signed since pandas uses -1 for nulls
1866+ // uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
1867+ using OutputType = std::conditional_t <
1868+ std::is_same<T, uint8_t >::value, int16_t ,
1869+ std::conditional_t <std::is_same<T, uint16_t >::value, int32_t ,
1870+ std::conditional_t <std::is_same<T, uint32_t >::value, int64_t , T>>>;
1871+ const int npy_output_type = std::is_same<OutputType, int16_t >::value ? NPY_INT16
1872+ : std::is_same<OutputType, int32_t >::value ? NPY_INT32
1873+ : std::is_same<OutputType, int64_t >::value
1874+ ? NPY_INT64
1875+ : TRAITS::npy_type;
1876+
1877+ RETURN_NOT_OK (this ->AllocateNDArray (npy_output_type, 1 ));
1878+ auto out_values = reinterpret_cast <OutputType*>(this ->block_data_ );
18671879
18681880 for (int c = 0 ; c < data.num_chunks (); c++) {
18691881 const auto & arr = checked_cast<const DictionaryArray&>(*data.chunk (c));
@@ -1874,7 +1886,7 @@ class CategoricalWriter
18741886 // Null is -1 in CategoricalBlock
18751887 for (int i = 0 ; i < arr.length (); ++i) {
18761888 if (indices.IsValid (i)) {
1877- *out_values++ = values[i];
1889+ *out_values++ = static_cast <OutputType>( values[i]) ;
18781890 } else {
18791891 *out_values++ = -1 ;
18801892 }
@@ -1927,7 +1939,11 @@ class CategoricalWriter
19271939 const auto & arr_first = checked_cast<const DictionaryArray&>(*data.chunk (0 ));
19281940 const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices ());
19291941
1930- if (data.num_chunks () == 1 && indices_first->null_count () == 0 ) {
1942+ // For unsigned types, we need to convert to signed for pandas compatibility
1943+ // even when there are no nulls, so we skip the fast path
1944+ const bool is_unsigned = std::is_unsigned<T>::value;
1945+
1946+ if (data.num_chunks () == 1 && indices_first->null_count () == 0 && !is_unsigned) {
19311947 RETURN_NOT_OK (
19321948 CheckIndexBounds (*indices_first->data (), arr_first.dictionary ()->length ()));
19331949
@@ -2023,9 +2039,9 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
20232039 CATEGORICAL_CASE (Int16Type);
20242040 CATEGORICAL_CASE (Int32Type);
20252041 CATEGORICAL_CASE (Int64Type);
2026- case Type::UINT8:
2027- case Type::UINT16:
2028- case Type::UINT32:
2042+ CATEGORICAL_CASE (UInt8Type);
2043+ CATEGORICAL_CASE (UInt16Type);
2044+ CATEGORICAL_CASE (UInt32Type);
20292045 case Type::UINT64:
20302046 return Status::TypeError (
20312047 " Converting unsigned dictionary indices to pandas" ,
0 commit comments