12 changes: 12 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -37,6 +37,10 @@
namespace arrow {
namespace py {

#ifndef NPY_VSTRING
# define NPY_VSTRING 2056
#endif
Comment on lines +40 to +42
Member: Do we actually need this? Can IsStringDType just return false if this constant is not defined (this is presumably when compiling with NumPy < 2)?

NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
PyAcquireGIL lock;
arr_ = ao;
@@ -122,6 +126,10 @@ Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) {
return NumPyDtypeToArrow(descr);
}

bool IsStringDType(PyArray_Descr* descr) {
return descr != nullptr && descr->type_num == NPY_VSTRING;
Member: The nullptr check seems superfluous, why would one call this function with a null pointer?

}

Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
return Status::TypeError("Did not pass numpy.dtype object");
@@ -133,6 +141,10 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
int type_num = fix_numpy_type_num(descr->type_num);

if (IsStringDType(descr)) {
return utf8();
}

switch (type_num) {
TO_ARROW_TYPE_CASE(BOOL, boolean);
TO_ARROW_TYPE_CASE(INT8, int8);
2 changes: 2 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_convert.h
@@ -55,6 +55,8 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
ARROW_PYTHON_EXPORT
Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);

ARROW_PYTHON_EXPORT bool IsStringDType(PyArray_Descr* descr);

ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
const std::vector<std::string>& dim_names,
std::shared_ptr<Tensor>* out);
145 changes: 145 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -27,6 +27,7 @@
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

@@ -59,6 +60,8 @@
#include "arrow/python/type_traits.h"
#include "arrow/python/vendored/pythoncapi_compat.h"

#include <numpy/arrayobject.h>

namespace arrow {

using internal::checked_cast;
@@ -74,6 +77,27 @@ using internal::NumPyTypeSize;

namespace {

#if NPY_ABI_VERSION >= 0x02000000
Member: I'm not sure why we're guarding with NPY_ABI_VERSION rather than either NPY_VERSION or NPY_FEATURE_VERSION. Can a NumPy maintainer explain how these 3 macros differ? @ngoldbaum @seberg

(also, would be nice if the NumPy docs were a bit more talkative about this)

Contributor: NPY_ABI_VERSION is the compile-time header version of NumPy; NPY_FEATURE_VERSION is the runtime version you are compiling for. I.e. by default, API that is only available in newer versions is disabled, so you can't compile to run with 1.x support but use newer API.

In this particular case (effectively using future API) #if NPY_ABI_VERSION >= 0x02000000 may tell you that a definition is already included in the header, in this case I guess PyArray_StringDTypeObject.
There is no reason to hide such a definition, so we don't (the thing to hide is mostly API table entries).

Maybe there should be a section on "how to use future API depending on the NumPy runtime version" (although for things we really expect it, we may want to add it to the npy2_compat.h header ourselves instead).

FWIW, I still think it makes most sense to wholesale copy-paste the NumPy header definitions. Then add some form of guard (and be it #if NPY_FEATURE_VERSION < NUMPY_2_0_VERSION or what it was), so that when it is exposed by the NumPy headers you stop using it.

Member: We don't seem to define NPY_FEATURE_VERSION anywhere, so does that mean the 2.0 macros are not available in the NumPy headers? If we define NPY_FEATURE_VERSION to 2.0, does that mean that the produced extension will not be compiled with NumPy 1.x?

It would be nice if these interactions were made clearer somewhere.

Contributor: > If we define NPY_FEATURE_VERSION to 2.0, does that mean that the produced extension will not be compiled with NumPy 1.x?

Yeah, except what you define would be NPY_TARGET_VERSION. Don't ask me why I used a different name, it was probably silly, but it is what we have now.
I think NPY_TARGET_VERSION is described. But this way of using future API conditionally is not (the only place is that npy_2_compat.h header in NumPy itself uses similar conditionals).

Member: (also, can we assume that NPY_FEATURE_VERSION is always <= NPY_ABI_VERSION? And what about NPY_VERSION, is it something different?)
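The distinction discussed in this thread can be sketched at compile time. This is an illustrative stand-in, not the PyArrow code: the FAKE_ macros and both function names are hypothetical, and the values mimic (but are not) real NumPy version numbers. The idea is that the ABI/header version tells you which declarations exist in the headers you compile against, while the feature (target) version gates which API table entries you may call directly.

```cpp
// Stand-ins for NPY_ABI_VERSION and NPY_FEATURE_VERSION (hypothetical values).
#define FAKE_NPY_ABI_VERSION 0x02000000      // compiled against NumPy 2.x headers
#define FAKE_NPY_FEATURE_VERSION 0x0000000D  // but targeting the 1.x runtime API

// The PR's pattern: declarations are guarded on the header (ABI) version,
// since they exist whenever the 2.x headers do...
constexpr bool headers_have_numpy2_defs() {
  return FAKE_NPY_ABI_VERSION >= 0x02000000;
}

// ...whereas calling 2.x-only API directly would require raising
// NPY_TARGET_VERSION (and hence NPY_FEATURE_VERSION) to 2.0.
constexpr bool can_call_numpy2_api_directly() {
  return FAKE_NPY_FEATURE_VERSION >= 0x02000000;
}
```

Under these assumptions, the PR's situation is exactly `headers_have_numpy2_defs() && !can_call_numpy2_api_directly()`, which is why it reaches into `PyArray_API` by index instead of using the guarded macros.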

inline npy_string_allocator* ArrowNpyString_acquire_allocator(
const PyArray_StringDTypeObject* descr) {
using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
return reinterpret_cast<Func>(PyArray_API[316])(descr);
Member: Why not call NpyString_acquire_allocator directly? AFAICT it is a macro that does roughly what your code is doing:

#if NPY_FEATURE_VERSION >= NPY_2_0_API_VERSION
#define NpyString_acquire_allocator \
        (*(npy_string_allocator * (*)(const PyArray_StringDTypeObject *)) \
    PyArray_API[316])
#endif

}

inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
using Func = void (*)(npy_string_allocator*);
reinterpret_cast<Func>(PyArray_API[318])(allocator);
}

inline int ArrowNpyString_load(npy_string_allocator* allocator,
const npy_packed_static_string* packed,
npy_static_string* out) {
using Func =
int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*);
return reinterpret_cast<Func>(PyArray_API[313])(allocator, packed, out);
}
#endif

Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
std::shared_ptr<ResizableBuffer>* out) {
int64_t null_bytes = bit_util::BytesForBits(length);
@@ -233,6 +257,13 @@ class NumPyConverter {
Status Visit(const LargeStringType& type);
Status Visit(const StringViewType& type);

#if NPY_ABI_VERSION >= 0x02000000
template <typename Builder>
Status AppendStringDTypeValues(Builder* builder);

Status ConvertStringDType();
#endif

Status Visit(const StructType& type);

Status Visit(const FixedSizeBinaryType& type);
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
return Status::OK();
}

if (IsStringDType(dtype_)) {
#if NPY_ABI_VERSION >= 0x02000000
RETURN_NOT_OK(ConvertStringDType());
Member: It's weird and confusing to do this outside of the visitor machinery. Is it possible to straighten that out?

Member: (also, NumPyConverter::VisitString has a special case for all-NaN Pandas arrays; I'm not sure whether that can apply here @jorisvandenbossche)

return Status::OK();
#else
return Status::NotImplemented(
"NumPy StringDType requires building PyArrow with NumPy >= 2.0");
Comment on lines +377 to +378
Member: Can this actually happen? I got the impression that one cannot use NumPy 2 if PyArrow was compiled for NumPy < 2; am I mistaken @ngoldbaum @seberg?

Contributor: That is correct, the NumPy C-API import will just barf at you and refuse to run.

#endif
}

if (type_ == nullptr) {
return Status::Invalid("Must pass data type for non-object arrays");
}
Expand Down Expand Up @@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}

#if NPY_ABI_VERSION >= 0x02000000
template <typename Builder>
Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);

npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);

Comment: FYI for other reviewers: this locks a mutex internally in NumPy.

if (allocator == nullptr) {
return Status::Invalid("Failed to acquire NumPy StringDType allocator");
}

struct AllocatorGuard {
npy_string_allocator* ptr;
explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
~AllocatorGuard() {
if (ptr != nullptr) {
ArrowNpyString_release_allocator(ptr);
}
}
} guard(allocator);
Comment on lines +869 to +877
Member (@pitrou, Jan 8, 2026): I think this can be written more concisely using std::unique_ptr:

Suggested change
struct AllocatorGuard {
npy_string_allocator* ptr;
explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
~AllocatorGuard() {
if (ptr != nullptr) {
ArrowNpyString_release_allocator(ptr);
}
}
} guard(allocator);
std::unique_ptr<npy_string_allocator, void(*)(npy_string_allocator*)>
allocator_guard(allocator, &ArrowNpyString_release_allocator);
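The suggestion above can be shown working in isolation. This is a self-contained illustration, not the PyArrow code: FakeAllocator, release_allocator, and guard_releases_on_scope_exit are stand-ins for npy_string_allocator, ArrowNpyString_release_allocator, and the converter's scope, chosen so the RAII behavior is observable.

```cpp
#include <memory>

struct FakeAllocator {   // stands in for the opaque npy_string_allocator
  bool released = false;
};

// Stands in for ArrowNpyString_release_allocator.
void release_allocator(FakeAllocator* a) {
  if (a != nullptr) a->released = true;
}

bool guard_releases_on_scope_exit() {
  FakeAllocator alloc;
  {
    // std::unique_ptr with a custom deleter replaces the hand-rolled
    // AllocatorGuard struct from the diff.
    std::unique_ptr<FakeAllocator, void (*)(FakeAllocator*)> guard(
        &alloc, &release_allocator);
    // ... use the allocator while the guard is alive ...
  }  // the deleter runs here, exactly once, on every exit path
  return alloc.released;
}
```

The unique_ptr form also releases correctly on early returns from RETURN_NOT_OK-style macros, which the manual guard handles only because its destructor happens to do the same thing.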


npy_static_string value = {0, nullptr};
char* data = PyArray_BYTES(arr_);
Member: Can we make this const char*?


if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder->AppendNull());
continue;
}

const auto* packed =
reinterpret_cast<const npy_packed_static_string*>(data + i * stride_);
Comment on lines +890 to +891
Member: I'm just curious, is the StringDType layout documented somewhere? I couldn't find any reference easily in the NumPy docs. @ngoldbaum @seberg

Contributor: It is described in the NEP, but npy_packed_static_string is intentionally opaque.

const int is_null = ArrowNpyString_load(allocator, packed, &value);
Member: I'm curious, why does this NumPy API need an allocator if all it does is return a view into the array contents? Especially as no deallocation seems involved afterwards... @ngoldbaum @seberg

Contributor: https://numpy.org/neps/nep-0055-string_dtype.html#memory-layout-and-managing-heap-allocations is the NEP about it.

But basically, the string can't be stored in the array data itself, because NumPy requires a fixed size per element.
So instead, the string (if not very small) is stored in a separately allocated buffer (much like in Arrow, except it's more annoying as we are N-D). And that buffer is owned by the dtype, which is why you fetch the allocator from the dtype/descr.

Member: Ok, but this is just loading an existing string. Why is an allocator needed for that? There is no corresponding deallocation call, it seems...

Contributor: The "allocator" knows where the second buffer is. The other reason is that it adds locking so that we don't point to corrupted data for StringDType.

Comment: Maybe "allocator" is the wrong name if it's confusing you. The allocator holds the reference to the per-descriptor memory pool as well as the per-descriptor lock.
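A conceptual model of this explanation, under the stated assumptions: NumPy's real layout is opaque, so SideBufferAllocator and load_view below are invented stand-ins (for npy_string_allocator and NpyString_load), not the NumPy API. The point they illustrate is that the "allocator" bundles the per-descriptor side buffer for long strings with the per-descriptor lock, so loading an element returns a borrowed view and nothing needs to be freed afterwards.

```cpp
#include <cstddef>
#include <mutex>
#include <string>
#include <string_view>
#include <vector>

struct SideBufferAllocator {      // stands in for npy_string_allocator
  std::mutex lock;                // per-descriptor lock
  std::vector<std::string> heap;  // per-descriptor pool for long strings
};

// Stands in for NpyString_load: yields a view into allocator-owned storage,
// taking the lock so the view is not read while the pool is being mutated.
std::string_view load_view(SideBufferAllocator& alloc, std::size_t index) {
  std::lock_guard<std::mutex> guard(alloc.lock);
  return std::string_view(alloc.heap[index]);
}
```

In this model the view stays valid only as long as the allocator (descriptor) outlives it, which matches why the converter keeps the allocator acquired for the whole conversion loop.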

if (is_null == -1) {
RETURN_IF_PYERROR();
return Status::Invalid("Failed to unpack NumPy StringDType value");
}
if (is_null) {
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
}
}
return Status::OK();
}

for (int64_t i = 0; i < length_; ++i) {
const auto* packed = reinterpret_cast<const npy_packed_static_string*>(data);
const int is_null = ArrowNpyString_load(allocator, packed, &value);
if (is_null == -1) {
RETURN_IF_PYERROR();
return Status::Invalid("Failed to unpack NumPy StringDType value");
}
if (is_null) {
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
}
Comment on lines +907 to +917
Member: You could easily factor this snippet out into a lambda to avoid code repetition with the loop above.
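The factoring suggested in this comment can be sketched with stand-in types (Element and convert are illustrative, not the PyArrow converter or builder API): the per-element "null or value" append logic lives in one lambda shared by both the masked and unmasked paths.

```cpp
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

using Element = std::optional<std::string>;  // nullopt models a null entry

std::vector<Element> convert(const std::vector<Element>& input,
                             const std::vector<bool>* mask) {
  std::vector<Element> out;
  // Shared per-element logic; in the real code this is where the packed
  // string would be loaded and builder->AppendNull()/Append() called.
  auto append_element = [&out](const Element& e) { out.push_back(e); };
  for (std::size_t i = 0; i < input.size(); ++i) {
    if (mask != nullptr && (*mask)[i]) {
      append_element(std::nullopt);  // masked entries become null
    } else {
      append_element(input[i]);
    }
  }
  return out;
}
```

With the lambda in place, the masked and unmasked loops differ only in whether the mask is consulted, rather than duplicating the load-and-append body.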

data += stride_;
}

return Status::OK();
}

Status NumPyConverter::ConvertStringDType() {
util::InitializeUTF8();
Member: Note this is only useful if we validate UTF8 values, which this doesn't seem to be doing? (presumably because NumPy already advertises the data as valid UTF8?)


if (type_ == nullptr) {
type_ = utf8();
}

switch (type_->id()) {
case Type::STRING: {
arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);
RETURN_NOT_OK(builder.Reserve(length_));
RETURN_NOT_OK(AppendStringDTypeValues(&builder));

ArrayVector chunks;
RETURN_NOT_OK(builder.Finish(&chunks));
for (const auto& chunk : chunks) {
RETURN_NOT_OK(PushArray(chunk->data()));
}
return Status::OK();
}
case Type::LARGE_STRING: {
LargeStringBuilder builder(pool_);
RETURN_NOT_OK(builder.Reserve(length_));
RETURN_NOT_OK(AppendStringDTypeValues(&builder));
return PushBuilderResult(&builder);
}
case Type::STRING_VIEW: {
StringViewBuilder builder(pool_);
RETURN_NOT_OK(builder.Reserve(length_));
RETURN_NOT_OK(AppendStringDTypeValues(&builder));
return PushBuilderResult(&builder);
}
default:
return Status::TypeError(
"NumPy StringDType can only be converted to Arrow string types");
}
}
#endif

Status NumPyConverter::Visit(const StructType& type) {
std::vector<NumPyConverter> sub_converters;
std::vector<OwnedRefNoGIL> sub_arrays;
113 changes: 113 additions & 0 deletions python/pyarrow/tests/test_array.py
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)


@pytest.mark.numpy
def test_array_from_numpy_string_dtype():
dtypes_mod = getattr(np, "dtypes", None)
Member: Can use something like dtypes = pytest.importorskip("numpy.dtypes")

Member: Or, even better, you can use a pytest fixture:

@pytest.fixture
def string_dtype():
    dtypes = pytest.importorskip("numpy.dtypes")
    dtype_class = getattr(dtypes, "StringDType", None)
    if dtype_class is None:
        pytest.skip("NumPy StringDType not available (NumPy > 2 needed)")
    return dtype_class

and then simply:

@pytest.mark.numpy
def test_array_from_numpy_string_dtype(string_dtype):
    arr = np.array(["some", "strings"], dtype=string_dtype())
    # etc.

if dtypes_mod is None:
pytest.skip("NumPy dtypes module not available")

StringDType = getattr(dtypes_mod, "StringDType", None)
if StringDType is None:
pytest.skip("NumPy StringDType not available")

dtype = StringDType()

arr = np.array(["some", "strings"], dtype=dtype)

arrow_arr = pa.array(arr)

assert arrow_arr.type == pa.utf8()
assert arrow_arr.to_pylist() == ["some", "strings"]

arrow_arr = pa.array(arr, type=pa.string())
Member: Note that pa.string() and pa.utf8() are the same thing; it's a bit confusing to use both spellings here.

assert arrow_arr.type == pa.string()
Member: Can you also call arrow_arr.validate(full=True)? (also in other places)

assert arrow_arr.to_pylist() == ["some", "strings"]
Member: Can also be written assert arrow_arr.to_pylist() == arr.tolist()


arrow_arr = pa.array(arr, type=pa.large_string())
assert arrow_arr.type == pa.large_string()
assert arrow_arr.to_pylist() == ["some", "strings"]

arrow_arr = pa.array(arr, type=pa.string_view())
assert arrow_arr.type == pa.string_view()
assert arrow_arr.to_pylist() == ["some", "strings"]

arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype)
arr = arr_full[::2]
arrow_arr = pa.array(arr)
assert arrow_arr.type == pa.utf8()
assert arrow_arr.to_pylist() == ["a", "c", "e"]


@pytest.mark.numpy
def test_numpy_stringdtype_thresholds_and_unicode():
dtypes_mod = getattr(np, "dtypes", None)
if dtypes_mod is None:
pytest.skip("NumPy dtypes module not available")

StringDType = getattr(dtypes_mod, "StringDType", None)
if StringDType is None:
pytest.skip("NumPy StringDType not available")

dtype = StringDType()

short = "hello"
medium = "a" * 100
long_ = "b" * 300
unicode_ = "árvíztűrő tükörfúrógép 🥐 你好"
long_unicode = "🥐" * 200

arr = np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype)
assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_, long_unicode]


@pytest.mark.numpy
def test_array_from_numpy_string_dtype_nulls_and_mask():
dtypes_mod = getattr(np, "dtypes", None)
if dtypes_mod is None:
pytest.skip("NumPy dtypes module not available")

StringDType = getattr(dtypes_mod, "StringDType", None)
if StringDType is None:
pytest.skip("NumPy StringDType not available")

# Real StringDType, use its NA sentinel
dtype = StringDType(na_object=None)
arr = np.array(["this array has", None, "as an entry"], dtype=dtype)

arrow_arr = pa.array(arr)
assert arrow_arr.type == pa.utf8()
assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]

# Test interplay of NA sentinel and an explicit mask:
# - index 1 is null because of na_object / Python None
# - index 2 is forced null by the mask
mask = np.array([False, False, True], dtype=bool)
arrow_arr = pa.array(arr, mask=mask)
assert arrow_arr.type == pa.utf8()
assert arrow_arr.null_count == 2
Member: Just validate arrow_arr and it will check null_count consistency.

assert arrow_arr.to_pylist() == ["this array has", None, None]

mask = np.array([True, False, True], dtype=bool)
assert pa.array(arr, mask=mask).to_pylist() == [None, None, None]


@pytest.mark.numpy
def test_array_from_numpy_string_dtype_string_sentinel_and_mask():
dtypes_mod = getattr(np, "dtypes", None)
if dtypes_mod is None:
pytest.skip("NumPy dtypes module not available")

StringDType = getattr(dtypes_mod, "StringDType", None)
if StringDType is None:
pytest.skip("NumPy StringDType not available")

sentinel = "__placeholder__"
dtype = StringDType(na_object=sentinel)
arr = np.array(["this array has", sentinel, "as an entry"], dtype=dtype)

arrow_arr = pa.array(arr)
assert arrow_arr.type == pa.utf8()
assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]

mask = np.array([False, False, True], dtype=bool)
assert pa.array(arr, mask=mask).to_pylist() == ["this array has", None, None]


@pytest.mark.numpy
def test_array_string_from_non_string():
# ARROW-5682 - when converting to string raise on non string-like dtype
Expand Down