diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
index 5647e895d0f..caee39e49b8 100644
--- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -364,15 +364,33 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
   return Status::OK();
 }
 
+// Downcast buffer from FromType to ToType with optional overflow checking.
+// This function only supports narrowing casts (FromType wider than ToType).
+// Do not use this function for widening casts (ToType wider than FromType).
 template <typename FromType, typename ToType>
-Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
-                        std::shared_ptr<Buffer>* out) {
+Status StaticDowncastBuffer(const Buffer& input, int64_t length, MemoryPool* pool,
+                            const uint8_t* null_bitmap,
+                            const compute::CastOptions& cast_options,
+                            std::shared_ptr<Buffer>* out) {
   ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool));
 
   auto in_values = reinterpret_cast<const FromType*>(input.data());
   auto out_values = reinterpret_cast<ToType*>(result->mutable_data());
+
+  constexpr FromType kMin = std::numeric_limits<ToType>::min();
+  constexpr FromType kMax = std::numeric_limits<ToType>::max();
+
   for (int64_t i = 0; i < length; ++i) {
-    *out_values++ = static_cast<ToType>(*in_values++);
+    FromType value = *in_values++;
+    // Check overflow only when cast_options.allow_int_overflow is false and value is not
+    // null
+    bool check_overflow = !cast_options.allow_int_overflow &&
+                          ((null_bitmap == nullptr) || bit_util::GetBit(null_bitmap, i));
+    if (check_overflow && (value < kMin || value > kMax)) {
+      return Status::Invalid("Integer value ", value, " out of bounds for int",
+                             sizeof(ToType) * 8, " conversion at index ", i);
+    }
+    *out_values++ = static_cast<ToType>(value);
   }
   *out = std::move(result);
   return Status::OK();
@@ -496,10 +514,10 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
     // separately here from int64_t to int32_t, because this data is not
     // supported in compute::Cast
     if (date_dtype->meta.base == NPY_FR_D) {
-      // TODO(wesm): How pedantic do we really want to be about checking for int32
-      // overflow here?
-      Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
-      RETURN_NOT_OK(s);
+      // Downcast from int64 to int32 with overflow checking
+      const uint8_t* null_bitmap_ptr = null_bitmap_ ? null_bitmap_->data() : nullptr;
+      RETURN_NOT_OK((StaticDowncastBuffer<int64_t, int32_t>(
+          **data, length_, pool_, null_bitmap_ptr, cast_options_, data)));
     } else {
       ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
       if (!input_type->Equals(*type_)) {
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ec361159c5f..f0432113318 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2445,6 +2445,32 @@ def test_array_roundtrip_from_numpy_datetimeD():
     assert result.dtype == arr.dtype
 
 
+@pytest.mark.numpy
+def test_array_from_numpy_datetime_overflow():
+    # datetime64[D] to date32 conversion should check for int32 overflow
+    # when safe=True (default)
+    overflow_value = np.int64(3000000000)
+    arr = np.array([overflow_value], dtype='datetime64[D]')
+    with pytest.raises(pa.ArrowInvalid, match='value .* out of bounds'):
+        pa.array(arr, type=pa.date32())
+
+    underflow_value = np.int64(-3000000000)
+    arr = np.array([underflow_value], dtype='datetime64[D]')
+    with pytest.raises(pa.ArrowInvalid, match='value .* out of bounds'):
+        pa.array(arr, type=pa.date32())
+
+    # safe=False should allow overflow
+    result = pa.array(np.array([overflow_value], dtype='datetime64[D]'),
+                      type=pa.date32(), safe=False)
+    assert len(result) == 1
+
+    # Values within int32 range should work
+    valid_arr = np.array([0, 100, -100, 2147483647, -2147483648],
+                         dtype='datetime64[D]')
+    result = pa.array(valid_arr, type=pa.date32())
+    assert len(result) == 5
+
+
 def test_array_from_naive_datetimes():
     arr = pa.array([
         None,