Skip to content

Commit 7d3650e

Browse files
committed
[Python] Overflow check in datetime conversion in pa.array
1 parent 7a36fcc commit 7d3650e

File tree

2 files changed

+48
-8
lines changed

2 files changed

+48
-8
lines changed

python/pyarrow/src/arrow/python/numpy_to_arrow.cc

Lines changed: 22 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -365,14 +365,27 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
365365
}
366366

367367
template <typename FromType, typename ToType>
368-
Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
368+
Status StaticCastBuffer(const Buffer& input, int64_t length, MemoryPool* pool,
369+
const uint8_t* null_bitmap,
370+
const compute::CastOptions& cast_options,
369371
std::shared_ptr<Buffer>* out) {
370372
ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool));
371373

372374
auto in_values = reinterpret_cast<const FromType*>(input.data());
373375
auto out_values = reinterpret_cast<ToType*>(result->mutable_data());
376+
377+
constexpr FromType kMin = std::numeric_limits<ToType>::min();
378+
constexpr FromType kMax = std::numeric_limits<ToType>::max();
379+
374380
for (int64_t i = 0; i < length; ++i) {
375-
*out_values++ = static_cast<ToType>(*in_values++);
381+
FromType value = *in_values++;
382+
// Skip overflow check for null values
383+
bool is_null = (null_bitmap != nullptr) && !bit_util::GetBit(null_bitmap, i);
384+
if (!is_null && !cast_options.allow_int_overflow && (value < kMin || value > kMax)) {
385+
return Status::Invalid("Integer value ", value, " out of bounds for int",
386+
sizeof(ToType) * 8, " conversion at index ", i);
387+
}
388+
*out_values++ = static_cast<ToType>(value);
376389
}
377390
*out = std::move(result);
378391
return Status::OK();
@@ -496,16 +509,17 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
496509
// separately here from int64_t to int32_t, because this data is not
497510
// supported in compute::Cast
498511
if (date_dtype->meta.base == NPY_FR_D) {
499-
// TODO(wesm): How pedantic do we really want to be about checking for int32
500-
// overflow here?
501-
Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
502-
RETURN_NOT_OK(s);
512+
// Downcast from int64 to int32 with overflow checking
513+
const uint8_t* null_bitmap_ptr = null_bitmap_ ? null_bitmap_->data() : nullptr;
514+
RETURN_NOT_OK((StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_,
515+
null_bitmap_ptr, cast_options_,
516+
data)));
503517
} else {
504518
ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
505519
if (!input_type->Equals(*type_)) {
506520
// The null bitmap was already computed in VisitNative()
507-
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
508-
type_, cast_options_, pool_, data));
521+
RETURN_NOT_OK((StaticCastBuffer<int64_t, int32_t>(
522+
**data, length_, pool_, null_bitmap_ptr, cast_options_, data)));
509523
}
510524
}
511525
} else {

python/pyarrow/tests/test_array.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2445,6 +2445,32 @@ def test_array_roundtrip_from_numpy_datetimeD():
24452445
assert result.dtype == arr.dtype
24462446

24472447

2448+
@pytest.mark.numpy
2449+
def test_array_from_numpy_datetime_overflow():
2450+
# datetime64[D] to date32 conversion should check for int32 overflow
2451+
# when safe=True (default)
2452+
overflow_value = np.int64(3000000000)
2453+
arr = np.array([overflow_value], dtype='datetime64[D]')
2454+
with pytest.raises(pa.ArrowInvalid, match='value .* out of bounds'):
2455+
pa.array(arr, type=pa.date32())
2456+
2457+
underflow_value = np.int64(-3000000000)
2458+
arr = np.array([underflow_value], dtype='datetime64[D]')
2459+
with pytest.raises(pa.ArrowInvalid, match='value .* out of bounds'):
2460+
pa.array(arr, type=pa.date32())
2461+
2462+
# safe=False should allow overflow
2463+
result = pa.array(np.array([overflow_value], dtype='datetime64[D]'),
2464+
type=pa.date32(), safe=False)
2465+
assert len(result) == 1
2466+
2467+
# Values within int32 range should work
2468+
valid_arr = np.array([0, 100, -100, 2147483647, -2147483648],
2469+
dtype='datetime64[D]')
2470+
result = pa.array(valid_arr, type=pa.date32())
2471+
assert len(result) == 5
2472+
2473+
24482474
def test_array_from_naive_datetimes():
24492475
arr = pa.array([
24502476
None,

0 commit comments

Comments
 (0)