apache · adamreeve · Dec 23, 2025 · Dec 11, 2025 · Dec 19, 2025 · Dec 23, 2025
@@ -1862,8 +1862,21 @@ class CategoricalWriter
   }
 
   Status WriteIndicesUniform(const ChunkedArray& data) {
-    RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
-    T* out_values = reinterpret_cast<T*>(this->block_data_);
+    // Pandas categorical codes are signed (-1 marks null), so widen unsigned index
+    // types losslessly: uint8 -> int16, uint16 -> int32, uint32 -> int64; signed: as-is
+    using OutputType = std::conditional_t<
+        std::is_same<T, uint8_t>::value, int16_t,
+        std::conditional_t<
+            std::is_same<T, uint16_t>::value, int32_t,
+            std::conditional_t<std::is_same<T, uint32_t>::value, int64_t, T>>>;
+    const int npy_output_type = std::is_same<OutputType, int16_t>::value   ? NPY_INT16
+                                : std::is_same<OutputType, int32_t>::value ? NPY_INT32
+                                : std::is_same<OutputType, int64_t>::value
+                                    ? NPY_INT64
+                                    : TRAITS::npy_type;
+
+    RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
+    auto out_values = reinterpret_cast<OutputType*>(this->block_data_);
 
     for (int c = 0; c < data.num_chunks(); c++) {
       const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
@@ -1874,7 +1887,7 @@ class CategoricalWriter
       // Null is -1 in CategoricalBlock
       for (int i = 0; i < arr.length(); ++i) {
         if (indices.IsValid(i)) {
-          *out_values++ = values[i];
+          *out_values++ = static_cast<OutputType>(values[i]);
         } else {
           *out_values++ = -1;
         }
@@ -1927,7 +1940,11 @@ class CategoricalWriter
     const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
     const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
 
-    if (data.num_chunks() == 1 && indices_first->null_count() == 0) {
+    // For unsigned types, we need to convert to signed for pandas compatibility
+    // even when there are no nulls, so we skip the fast path
+    const bool is_unsigned = std::is_unsigned<T>::value;
+
+    if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
       RETURN_NOT_OK(
           CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
 
@@ -2023,13 +2040,12 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
         CATEGORICAL_CASE(Int16Type);
         CATEGORICAL_CASE(Int32Type);
         CATEGORICAL_CASE(Int64Type);
-        case Type::UINT8:
-        case Type::UINT16:
-        case Type::UINT32:
+        CATEGORICAL_CASE(UInt8Type);
+        CATEGORICAL_CASE(UInt16Type);
+        CATEGORICAL_CASE(UInt32Type);
         case Type::UINT64:
           return Status::TypeError(
-              "Converting unsigned dictionary indices to pandas",
-              " not yet supported, index type: ", index_type.ToString());
+              "Converting UInt64 dictionary indices to pandas is not supported.");
         default:
           // Unreachable
           ARROW_DCHECK(false);

@@ -4114,24 +4114,40 @@ def test_dictionary_with_pandas():
         d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
         d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
 
-        if index_type[0] == 'u':
-            # TODO: unsigned dictionary indices to pandas
-            with pytest.raises(TypeError):
+        if index_type == 'uint64':
+            # uint64 is not supported due to overflow risk (values > 2^63-1)
+            with pytest.raises(TypeError,
+                               match="UInt64 dictionary indices"):
                 d1.to_pandas()
             continue
 
         pandas1 = d1.to_pandas()
-        ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
+        # Pandas Categorical uses signed int codes. Arrow converts:
+        # uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
+        if index_type == 'uint8':
+            compare_indices = indices.astype('int16')
+        elif index_type == 'uint16':
+            compare_indices = indices.astype('int32')
+        elif index_type == 'uint32':
+            compare_indices = indices.astype('int64')
+        else:
+            compare_indices = indices
+        ex_pandas1 = pd.Categorical.from_codes(compare_indices, categories=dictionary)
 
         tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))
 
         pandas2 = d2.to_pandas()
         assert pandas2.isnull().sum() == 1
 
-        # Unsigned integers converted to signed
-        signed_indices = indices
-        if index_type[0] == 'u':
-            signed_indices = indices.astype(index_type[1:])
+        # Use same conversion as above for comparison
+        if index_type == 'uint8':
+            signed_indices = indices.astype('int16')
+        elif index_type == 'uint16':
+            signed_indices = indices.astype('int32')
+        elif index_type == 'uint32':
+            signed_indices = indices.astype('int64')
+        else:
+            signed_indices = indices
         ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1,
                                                         signed_indices),
                                                categories=dictionary)