-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-42018: Add numpy.StringDType support #48391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
3b49f62
6e4c3c6
a90ea23
8729eb3
f49ba67
050ca86
da255c9
80a3aca
bef2c71
166dd05
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,10 @@ | |
| namespace arrow { | ||
| namespace py { | ||
|
|
||
| #ifndef NPY_VSTRING | ||
| # define NPY_VSTRING 2056 | ||
| #endif | ||
|
|
||
| NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) { | ||
| PyAcquireGIL lock; | ||
| arr_ = ao; | ||
|
|
@@ -122,6 +126,10 @@ Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) { | |
| return NumPyDtypeToArrow(descr); | ||
| } | ||
|
|
||
| bool IsStringDType(PyArray_Descr* descr) { | ||
| return descr != nullptr && descr->type_num == NPY_VSTRING; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The nullptr check seems superfluous: why would one call this function with a null pointer? |
||
| } | ||
|
|
||
| Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) { | ||
| if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { | ||
| return Status::TypeError("Did not pass numpy.dtype object"); | ||
|
|
@@ -133,6 +141,10 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) { | |
| Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) { | ||
| int type_num = fix_numpy_type_num(descr->type_num); | ||
|
|
||
| if (IsStringDType(descr)) { | ||
| return utf8(); | ||
| } | ||
|
|
||
| switch (type_num) { | ||
| TO_ARROW_TYPE_CASE(BOOL, boolean); | ||
| TO_ARROW_TYPE_CASE(INT8, int8); | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |||||||||||||||||||||||
| #include <limits> | ||||||||||||||||||||||||
| #include <memory> | ||||||||||||||||||||||||
| #include <string> | ||||||||||||||||||||||||
| #include <string_view> | ||||||||||||||||||||||||
| #include <utility> | ||||||||||||||||||||||||
| #include <vector> | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
@@ -59,6 +60,8 @@ | |||||||||||||||||||||||
| #include "arrow/python/type_traits.h" | ||||||||||||||||||||||||
| #include "arrow/python/vendored/pythoncapi_compat.h" | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| #include <numpy/arrayobject.h> | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| namespace arrow { | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| using internal::checked_cast; | ||||||||||||||||||||||||
|
|
@@ -74,6 +77,27 @@ using internal::NumPyTypeSize; | |||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| namespace { | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| #if NPY_ABI_VERSION >= 0x02000000 | ||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure why we're guarding with (also, would be nice if the NumPy docs were a bit more talkative about this)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ABI version is the compile time header version of NumPy. In this particular case (effectively using future API) Maybe there should be a section on "how to use future API depending on the NumPy runtime version" (although for things we really expect it, we may want to add it to the FWIW, I still think it makes most sense to wholesale copy-paste the NumPy header definitions. Then add some form of guard (and be it
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't seem to define It would be nice if these interactions were made clearer somewhere.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yeah, except what you define would be
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also, can we assume that |
||||||||||||||||||||||||
| inline npy_string_allocator* ArrowNpyString_acquire_allocator( | ||||||||||||||||||||||||
| const PyArray_StringDTypeObject* descr) { | ||||||||||||||||||||||||
| using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); | ||||||||||||||||||||||||
| return reinterpret_cast<Func>(PyArray_API[316])(descr); | ||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not call #if NPY_FEATURE_VERSION >= NPY_2_0_API_VERSION
#define NpyString_acquire_allocator \
(*(npy_string_allocator * (*)(const PyArray_StringDTypeObject *)) \
PyArray_API[316])
#endif |
||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { | ||||||||||||||||||||||||
| using Func = void (*)(npy_string_allocator*); | ||||||||||||||||||||||||
| reinterpret_cast<Func>(PyArray_API[318])(allocator); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| inline int ArrowNpyString_load(npy_string_allocator* allocator, | ||||||||||||||||||||||||
| const npy_packed_static_string* packed, | ||||||||||||||||||||||||
| npy_static_string* out) { | ||||||||||||||||||||||||
| using Func = | ||||||||||||||||||||||||
| int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); | ||||||||||||||||||||||||
| return reinterpret_cast<Func>(PyArray_API[313])(allocator, packed, out); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
alippai marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||
| #endif | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Status AllocateNullBitmap(MemoryPool* pool, int64_t length, | ||||||||||||||||||||||||
| std::shared_ptr<ResizableBuffer>* out) { | ||||||||||||||||||||||||
| int64_t null_bytes = bit_util::BytesForBits(length); | ||||||||||||||||||||||||
|
|
@@ -233,6 +257,13 @@ class NumPyConverter { | |||||||||||||||||||||||
| Status Visit(const LargeStringType& type); | ||||||||||||||||||||||||
| Status Visit(const StringViewType& type); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| #if NPY_ABI_VERSION >= 0x02000000 | ||||||||||||||||||||||||
| template <typename Builder> | ||||||||||||||||||||||||
| Status AppendStringDTypeValues(Builder* builder); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Status ConvertStringDType(); | ||||||||||||||||||||||||
| #endif | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Status Visit(const StructType& type); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Status Visit(const FixedSizeBinaryType& type); | ||||||||||||||||||||||||
|
|
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() { | |||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if (IsStringDType(dtype_)) { | ||||||||||||||||||||||||
| #if NPY_ABI_VERSION >= 0x02000000 | ||||||||||||||||||||||||
| RETURN_NOT_OK(ConvertStringDType()); | ||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's weird and confusing to do this outside of the visitor machinery. Is it possible to straighten that out?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also |
||||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||
| #else | ||||||||||||||||||||||||
| return Status::NotImplemented( | ||||||||||||||||||||||||
| "NumPy StringDType requires building PyArrow with NumPy >= 2.0"); | ||||||||||||||||||||||||
|
Comment on lines
+377
to
+378
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this at all possible to happen? I got the impression that one cannot use NumPy 2 if PyArrow was compiled for NumPy < 2, am I mistaken @ngoldbaum @seberg ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is correct, the NumPy C-API import will just barf at you and refuse to run. |
||||||||||||||||||||||||
| #endif | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if (type_ == nullptr) { | ||||||||||||||||||||||||
| return Status::Invalid("Must pass data type for non-object arrays"); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) { | |||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| #if NPY_ABI_VERSION >= 0x02000000 | ||||||||||||||||||||||||
| template <typename Builder> | ||||||||||||||||||||||||
| Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { | ||||||||||||||||||||||||
| auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr); | ||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI for other reviewers: this locks a mutex internally in NumPy. |
||||||||||||||||||||||||
| if (allocator == nullptr) { | ||||||||||||||||||||||||
| return Status::Invalid("Failed to acquire NumPy StringDType allocator"); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| struct AllocatorGuard { | ||||||||||||||||||||||||
| npy_string_allocator* ptr; | ||||||||||||||||||||||||
| explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {} | ||||||||||||||||||||||||
| ~AllocatorGuard() { | ||||||||||||||||||||||||
| if (ptr != nullptr) { | ||||||||||||||||||||||||
| ArrowNpyString_release_allocator(ptr); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } guard(allocator); | ||||||||||||||||||||||||
|
Comment on lines
+869
to
+877
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this can be written more concisely using
Suggested change
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| npy_static_string value = {0, nullptr}; | ||||||||||||||||||||||||
| char* data = PyArray_BYTES(arr_); | ||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we make this |
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if (mask_ != nullptr) { | ||||||||||||||||||||||||
| Ndarray1DIndexer<uint8_t> mask_values(mask_); | ||||||||||||||||||||||||
| for (int64_t i = 0; i < length_; ++i) { | ||||||||||||||||||||||||
| if (mask_values[i]) { | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder->AppendNull()); | ||||||||||||||||||||||||
| continue; | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| const auto* packed = | ||||||||||||||||||||||||
| reinterpret_cast<const npy_packed_static_string*>(data + i * stride_); | ||||||||||||||||||||||||
|
Comment on lines
+890
to
+891
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm just curious, is the StringDType layout documented somewhere? I couldn't find any reference easily in the NumPy docs. @ngoldbaum @seberg
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is described in the NEP, but |
||||||||||||||||||||||||
| const int is_null = ArrowNpyString_load(allocator, packed, &value); | ||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm curious, why does this NumPy API need an allocator if all it does is return a view into the array contents? Especially as no deallocation seems involved afterwards... @ngoldbaum @seberg
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://numpy.org/neps/nep-0055-string_dtype.html#memory-layout-and-managing-heap-allocations is the NEP about it. But basically, the string can't be stored in the array data itself, because NumPy requires a fixed size per element.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, but this is just loading an existing string. Why is an allocator needed for that? There is no corresponding deallocation call, it seems...
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The "allocator" knows where the second buffer is. The other reason is that it adds locking so that we don't point to corrupted data for StringDType. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe "allocator" is the wrong name if it's confusing you. The allocator holds the reference to the per-descriptor memory pool as well as the per-descriptor lock. |
||||||||||||||||||||||||
| if (is_null == -1) { | ||||||||||||||||||||||||
| RETURN_IF_PYERROR(); | ||||||||||||||||||||||||
| return Status::Invalid("Failed to unpack NumPy StringDType value"); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if (is_null) { | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder->AppendNull()); | ||||||||||||||||||||||||
| } else { | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size})); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| for (int64_t i = 0; i < length_; ++i) { | ||||||||||||||||||||||||
| const auto* packed = reinterpret_cast<const npy_packed_static_string*>(data); | ||||||||||||||||||||||||
| const int is_null = ArrowNpyString_load(allocator, packed, &value); | ||||||||||||||||||||||||
| if (is_null == -1) { | ||||||||||||||||||||||||
| RETURN_IF_PYERROR(); | ||||||||||||||||||||||||
| return Status::Invalid("Failed to unpack NumPy StringDType value"); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if (is_null) { | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder->AppendNull()); | ||||||||||||||||||||||||
| } else { | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size})); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
Comment on lines
+907
to
+917
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could easily have this snippet factored out in a lambda to avoid code repetition with the loop above. |
||||||||||||||||||||||||
| data += stride_; | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Status NumPyConverter::ConvertStringDType() { | ||||||||||||||||||||||||
| util::InitializeUTF8(); | ||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note this is only useful if we validate UTF8 values, which this doesn't seem to be doing? (presumably because NumPy already advertises the data as valid UTF8?) |
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| if (type_ == nullptr) { | ||||||||||||||||||||||||
| type_ = utf8(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| switch (type_->id()) { | ||||||||||||||||||||||||
| case Type::STRING: { | ||||||||||||||||||||||||
| arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder.Reserve(length_)); | ||||||||||||||||||||||||
| RETURN_NOT_OK(AppendStringDTypeValues(&builder)); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| ArrayVector chunks; | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder.Finish(&chunks)); | ||||||||||||||||||||||||
| for (const auto& chunk : chunks) { | ||||||||||||||||||||||||
| RETURN_NOT_OK(PushArray(chunk->data())); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return Status::OK(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| case Type::LARGE_STRING: { | ||||||||||||||||||||||||
| LargeStringBuilder builder(pool_); | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder.Reserve(length_)); | ||||||||||||||||||||||||
| RETURN_NOT_OK(AppendStringDTypeValues(&builder)); | ||||||||||||||||||||||||
| return PushBuilderResult(&builder); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| case Type::STRING_VIEW: { | ||||||||||||||||||||||||
| StringViewBuilder builder(pool_); | ||||||||||||||||||||||||
| RETURN_NOT_OK(builder.Reserve(length_)); | ||||||||||||||||||||||||
| RETURN_NOT_OK(AppendStringDTypeValues(&builder)); | ||||||||||||||||||||||||
| return PushBuilderResult(&builder); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| default: | ||||||||||||||||||||||||
| return Status::TypeError( | ||||||||||||||||||||||||
| "NumPy StringDType can only be converted to Arrow string types"); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| #endif | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Status NumPyConverter::Visit(const StructType& type) { | ||||||||||||||||||||||||
| std::vector<NumPyConverter> sub_converters; | ||||||||||||||||||||||||
| std::vector<OwnedRefNoGIL> sub_arrays; | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type): | |
| assert arrow_arr.equals(expected) | ||
|
|
||
|
|
||
| @pytest.mark.numpy | ||
| def test_array_from_numpy_string_dtype(): | ||
| dtypes_mod = getattr(np, "dtypes", None) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can use something like
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or, even better, you can use a pytest fixture:
```python
@pytest.fixture
def string_dtype():
    dtypes = pytest.importorskip("numpy.dtypes")
    dtype_class = getattr(dtypes, "StringDType", None)
    if dtype_class is None:
        pytest.skip("NumPy StringDType not available (NumPy > 2 needed)")
    return dtype_class
```
and then simply:
```python
@pytest.mark.numpy
def test_array_from_numpy_string_dtype(string_dtype):
    arr = np.array(["some", "strings"], dtype=string_dtype())
    # etc.
```
|
||
| if dtypes_mod is None: | ||
| pytest.skip("NumPy dtypes module not available") | ||
|
|
||
| StringDType = getattr(dtypes_mod, "StringDType", None) | ||
| if StringDType is None: | ||
| pytest.skip("NumPy StringDType not available") | ||
|
|
||
| dtype = StringDType() | ||
|
|
||
| arr = np.array(["some", "strings"], dtype=dtype) | ||
alippai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| arrow_arr = pa.array(arr) | ||
|
|
||
| assert arrow_arr.type == pa.utf8() | ||
| assert arrow_arr.to_pylist() == ["some", "strings"] | ||
|
|
||
| arrow_arr = pa.array(arr, type=pa.string()) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that |
||
| assert arrow_arr.type == pa.string() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also call |
||
| assert arrow_arr.to_pylist() == ["some", "strings"] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can also be written |
||
|
|
||
| arrow_arr = pa.array(arr, type=pa.large_string()) | ||
| assert arrow_arr.type == pa.large_string() | ||
| assert arrow_arr.to_pylist() == ["some", "strings"] | ||
|
|
||
| arrow_arr = pa.array(arr, type=pa.string_view()) | ||
| assert arrow_arr.type == pa.string_view() | ||
| assert arrow_arr.to_pylist() == ["some", "strings"] | ||
|
|
||
| arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype) | ||
| arr = arr_full[::2] | ||
| arrow_arr = pa.array(arr) | ||
| assert arrow_arr.type == pa.utf8() | ||
| assert arrow_arr.to_pylist() == ["a", "c", "e"] | ||
|
|
||
|
|
||
| @pytest.mark.numpy | ||
| def test_numpy_stringdtype_thresholds_and_unicode(): | ||
| dtypes_mod = getattr(np, "dtypes", None) | ||
| if dtypes_mod is None: | ||
| pytest.skip("NumPy dtypes module not available") | ||
|
|
||
| StringDType = getattr(dtypes_mod, "StringDType", None) | ||
| if StringDType is None: | ||
| pytest.skip("NumPy StringDType not available") | ||
|
|
||
| dtype = StringDType() | ||
|
|
||
| short = "hello" | ||
| medium = "a" * 100 | ||
| long_ = "b" * 300 | ||
| unicode_ = "árvíztűrő tükörfúrógép 🥐 你好" | ||
| long_unicode = "🥐" * 200 | ||
|
|
||
| arr = np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype) | ||
| assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_, long_unicode] | ||
|
|
||
|
|
||
| @pytest.mark.numpy | ||
| def test_array_from_numpy_string_dtype_nulls_and_mask(): | ||
| dtypes_mod = getattr(np, "dtypes", None) | ||
| if dtypes_mod is None: | ||
| pytest.skip("NumPy dtypes module not available") | ||
|
|
||
| StringDType = getattr(dtypes_mod, "StringDType", None) | ||
| if StringDType is None: | ||
| pytest.skip("NumPy StringDType not available") | ||
|
|
||
| # Real StringDType, use its NA sentinel | ||
| dtype = StringDType(na_object=None) | ||
| arr = np.array(["this array has", None, "as an entry"], dtype=dtype) | ||
|
|
||
| arrow_arr = pa.array(arr) | ||
| assert arrow_arr.type == pa.utf8() | ||
| assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] | ||
|
|
||
| # Test interplay of NA sentinel and an explicit mask: | ||
| # - index 1 is null because of na_object / Python None | ||
| # - index 2 is forced null by the mask | ||
| mask = np.array([False, False, True], dtype=bool) | ||
| arrow_arr = pa.array(arr, mask=mask) | ||
| assert arrow_arr.type == pa.utf8() | ||
| assert arrow_arr.null_count == 2 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just validate |
||
| assert arrow_arr.to_pylist() == ["this array has", None, None] | ||
|
|
||
| mask = np.array([True, False, True], dtype=bool) | ||
| assert pa.array(arr, mask=mask).to_pylist() == [None, None, None] | ||
|
|
||
|
|
||
| @pytest.mark.numpy | ||
| def test_array_from_numpy_string_dtype_string_sentinel_and_mask(): | ||
| dtypes_mod = getattr(np, "dtypes", None) | ||
| if dtypes_mod is None: | ||
| pytest.skip("NumPy dtypes module not available") | ||
|
|
||
| StringDType = getattr(dtypes_mod, "StringDType", None) | ||
| if StringDType is None: | ||
| pytest.skip("NumPy StringDType not available") | ||
|
|
||
| sentinel = "__placeholder__" | ||
| dtype = StringDType(na_object=sentinel) | ||
| arr = np.array(["this array has", sentinel, "as an entry"], dtype=dtype) | ||
|
|
||
| arrow_arr = pa.array(arr) | ||
| assert arrow_arr.type == pa.utf8() | ||
| assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] | ||
|
|
||
| mask = np.array([False, False, True], dtype=bool) | ||
| assert pa.array(arr, mask=mask).to_pylist() == ["this array has", None, None] | ||
|
|
||
|
|
||
| @pytest.mark.numpy | ||
| def test_array_string_from_non_string(): | ||
| # ARROW-5682 - when converting to string raise on non string-like dtype | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we actually need this? Can
`IsStringDType` just return false if this constant is not defined (this is presumably when compiling with NumPy < 2)?