Skip to content

Commit 3e75327

Browse files
committed
GH-48672, GH-48465: [Python] Add an option for truncating intraday milliseconds in Date64
1 parent 0bfbd19 commit 3e75327

File tree

13 files changed

+331
-31
lines changed

13 files changed

+331
-31
lines changed

python/pyarrow/array.pxi

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ cdef extern from "<variant>" namespace "std":
2727
T get[T](...)
2828

2929
cdef _sequence_to_array(object sequence, object mask, object size,
30-
DataType type, CMemoryPool* pool, c_bool from_pandas):
30+
DataType type, CMemoryPool* pool, c_bool from_pandas,
31+
bint truncate_date64_time):
3132
cdef:
3233
int64_t c_size
3334
PyConversionOptions options
@@ -41,6 +42,7 @@ cdef _sequence_to_array(object sequence, object mask, object size,
4142

4243
options.from_pandas = from_pandas
4344
options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
45+
options.truncate_date64_time = truncate_date64_time
4446

4547
with nogil:
4648
chunked = GetResultValue(
@@ -81,15 +83,16 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values,
8183

8284

8385
cdef _ndarray_to_array(object values, object mask, DataType type,
84-
c_bool from_pandas, c_bool safe, CMemoryPool* pool):
86+
c_bool from_pandas, c_bool safe, CMemoryPool* pool,
87+
bint truncate_date64_time):
8588
cdef:
8689
shared_ptr[CChunkedArray] chunked_out
8790
shared_ptr[CDataType] c_type = _ndarray_to_type(values, type)
8891
CCastOptions cast_options = CCastOptions(safe)
8992

9093
with nogil:
9194
check_status(NdarrayToArrow(pool, values, mask, from_pandas,
92-
c_type, cast_options, &chunked_out))
95+
c_type, cast_options, truncate_date64_time, &chunked_out))
9396

9497
if chunked_out.get().num_chunks() > 1:
9598
return pyarrow_wrap_chunked_array(chunked_out)
@@ -127,7 +130,7 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
127130

128131

129132
def array(object obj, type=None, mask=None, size=None, from_pandas=None,
130-
bint safe=True, MemoryPool memory_pool=None):
133+
bint safe=True, MemoryPool memory_pool=None, bint truncate_date64_time=True):
131134
"""
132135
Create pyarrow.Array instance from a Python object.
133136
@@ -162,6 +165,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
162165
memory_pool : pyarrow.MemoryPool, optional
163166
If not passed, will allocate memory from the currently-set default
164167
memory pool.
168+
truncate_date64_time : bool, default True
169+
If True (default), truncate intraday milliseconds when converting Python
170+
datetime objects to date64.
171+
If False, preserve the full datetime including time components.
165172
166173
Returns
167174
-------
@@ -313,7 +320,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
313320
elif (pandas_api.is_categorical(values) and
314321
type is not None and type.id != Type_DICTIONARY):
315322
result = _ndarray_to_array(
316-
np.asarray(values), mask, type, c_from_pandas, safe, pool
323+
np.asarray(
324+
values), mask, type, c_from_pandas, safe, pool, truncate_date64_time
317325
)
318326
elif pandas_api.is_categorical(values):
319327
if type is not None:
@@ -358,21 +366,22 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
358366
values, obj.dtype, type)
359367
if type and type.id == _Type_RUN_END_ENCODED:
360368
arr = _ndarray_to_array(
361-
values, mask, type.value_type, c_from_pandas, safe, pool)
369+
values, mask, type.value_type, c_from_pandas, safe, pool, truncate_date64_time)
362370
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
363371
memory_pool=memory_pool)
364372
else:
365373
result = _ndarray_to_array(values, mask, type, c_from_pandas, safe,
366-
pool)
374+
pool, truncate_date64_time)
367375
else:
368376
if type and type.id == _Type_RUN_END_ENCODED:
369377
arr = _sequence_to_array(
370-
obj, mask, size, type.value_type, pool, from_pandas)
378+
obj, mask, size, type.value_type, pool, from_pandas, truncate_date64_time)
371379
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
372380
memory_pool=memory_pool)
373381
# ConvertPySequence does strict conversion if type is explicitly passed
374382
else:
375-
result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
383+
result = _sequence_to_array(
384+
obj, mask, size, type, pool, c_from_pandas, truncate_date64_time)
376385

377386
if extension_type is not None:
378387
result = ExtensionArray.from_storage(extension_type, result)
@@ -880,7 +889,8 @@ cdef class _PandasConvertible(_Weakrefable):
880889
bint self_destruct=False,
881890
str maps_as_pydicts=None,
882891
types_mapper=None,
883-
bint coerce_temporal_nanoseconds=False
892+
bint coerce_temporal_nanoseconds=False,
893+
bint truncate_date64_time=False
884894
):
885895
"""
886896
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -965,6 +975,10 @@ cdef class _PandasConvertible(_Weakrefable):
965975
default behavior in pandas version 1.x. Set this option to True if
966976
you'd like to use this coercion when using pandas version >= 2.0
967977
for backwards compatibility (not recommended otherwise).
978+
truncate_date64_time : bool, default False
979+
If True, truncate intraday milliseconds when converting date64 to pandas
980+
datetime.
981+
If False (default), preserve the full datetime including time components.
968982
969983
Returns
970984
-------
@@ -1041,6 +1055,7 @@ cdef class _PandasConvertible(_Weakrefable):
10411055
split_blocks=split_blocks,
10421056
self_destruct=self_destruct,
10431057
maps_as_pydicts=maps_as_pydicts,
1058+
truncate_date64_time=truncate_date64_time,
10441059
coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
10451060
)
10461061
return self._to_pandas(options, categories=categories,
@@ -1063,6 +1078,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
10631078
result.self_destruct = options['self_destruct']
10641079
result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
10651080
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
1081+
result.truncate_date64_time = options['truncate_date64_time']
10661082

10671083
maps_as_pydicts = options['maps_as_pydicts']
10681084
if maps_as_pydicts is None:

python/pyarrow/includes/libarrow_python.pxd

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
6666
c_bool from_pandas
6767
c_bool ignore_timezone
6868
c_bool strict
69+
c_bool truncate_date64_time
6970

7071
# TODO Some functions below are not actually "nogil"
7172

@@ -81,12 +82,14 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
8182
# Overload without cast options. The C++ function of this shape takes no
# truncate_date64_time argument, so none is declared here; use the overload
# below (with CCastOptions) to control date64 truncation.
CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
                       c_bool from_pandas,
                       const shared_ptr[CDataType]& type,
                       shared_ptr[CChunkedArray]* out)
8587

8688
CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
8789
c_bool from_pandas,
8890
const shared_ptr[CDataType]& type,
8991
const CCastOptions& cast_options,
92+
c_bool truncate_date64_time,
9093
shared_ptr[CChunkedArray]* out)
9194

9295
CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
@@ -193,6 +196,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
193196
c_bool coerce_temporal_nanoseconds
194197
c_bool ignore_timezone
195198
c_bool deduplicate_objects
199+
c_bool truncate_date64_time
196200
c_bool safe_cast
197201
c_bool split_blocks
198202
c_bool self_destruct

python/pyarrow/pandas_compat.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,7 @@ def dataframe_to_types(df, preserve_index, columns=None):
597597

598598

599599
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
600-
safe=True):
600+
safe=True, truncate_date64_time=True):
601601
(all_names,
602602
column_names,
603603
column_field_names,
@@ -630,7 +630,8 @@ def convert_column(col, field):
630630
type_ = field.type
631631

632632
try:
633-
result = pa.array(col, type=type_, from_pandas=True, safe=safe)
633+
result = pa.array(col, type=type_, from_pandas=True, safe=safe,
634+
truncate_date64_time=truncate_date64_time)
634635
except (pa.ArrowInvalid,
635636
pa.ArrowNotImplementedError,
636637
pa.ArrowTypeError) as e:

python/pyarrow/scalar.pxi

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1598,7 +1598,8 @@ cdef object get_scalar_class_from_type(
15981598
return _scalar_classes[data_type.id()]
15991599

16001600

1601-
def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
1601+
def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None,
1602+
bint truncate_date64_time=True):
16021603
"""
16031604
Create a pyarrow.Scalar instance from a Python object.
16041605
@@ -1616,6 +1617,10 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
16161617
memory_pool : pyarrow.MemoryPool, optional
16171618
If not passed, will allocate memory from the currently-set default
16181619
memory pool.
1620+
truncate_date64_time : bool, default True
1621+
If True (default), truncate intraday milliseconds when converting Python
1622+
datetime objects to date64.
1623+
If False, preserve the full datetime including time components.
16191624
16201625
Returns
16211626
-------
@@ -1668,6 +1673,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
16681673
else:
16691674
options.from_pandas = from_pandas
16701675

1676+
options.truncate_date64_time = truncate_date64_time
1677+
16711678
value = [value]
16721679
with nogil:
16731680
chunked = GetResultValue(ConvertPySequence(value, None, options, pool))

python/pyarrow/src/arrow/python/arrow_to_pandas.cc

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1547,6 +1547,26 @@ void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) {
15471547
}
15481548
}
15491549

1550+
template <int64_t SHIFT>
1551+
inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) {
1552+
for (int c = 0; c < data.num_chunks(); c++) {
1553+
const auto& arr = *data.chunk(c);
1554+
const int64_t* in_values = GetPrimitiveValues<int64_t>(arr);
1555+
for (int64_t i = 0; i < arr.length(); ++i) {
1556+
if (arr.IsNull(i)) {
1557+
*out_values++ = kPandasTimestampNull;
1558+
} else {
1559+
int64_t truncated = in_values[i] - in_values[i] % kMillisecondsInDay;
1560+
if constexpr (SHIFT == 1) {
1561+
*out_values++ = truncated;
1562+
} else {
1563+
*out_values++ = truncated * SHIFT;
1564+
}
1565+
}
1566+
}
1567+
}
1568+
}
1569+
15501570
class DatetimeDayWriter : public TypedPandasWriter<NPY_DATETIME> {
15511571
public:
15521572
using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
@@ -1617,7 +1637,14 @@ class DatetimeMilliWriter : public DatetimeWriter<TimeUnit::MILLI> {
16171637
// Convert from days since epoch to datetime64[ms]
16181638
ConvertDatetime<int32_t, 86400000L>(*data, out_values);
16191639
} else if (type == Type::DATE64) {
1620-
ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
1640+
// Date64Type is millisecond timestamp
1641+
if (this->options_.truncate_date64_time) {
1642+
// Truncate intraday milliseconds
1643+
ConvertDatetimeWithTruncation<1L>(*data, out_values);
1644+
} else {
1645+
// Preserve time components
1646+
ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
1647+
}
16211648
} else {
16221649
const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
16231650
ARROW_DCHECK_EQ(TimeUnit::MILLI, ts_type.unit())
@@ -1652,9 +1679,14 @@ class DatetimeNanoWriter : public DatetimeWriter<TimeUnit::NANO> {
16521679
// Convert from days since epoch to datetime64[ns]
16531680
ConvertDatetime<int32_t, kNanosecondsInDay>(*data, out_values);
16541681
} else if (type == Type::DATE64) {
1655-
// Date64Type is millisecond timestamp stored as int64_t
1656-
// TODO(wesm): Do we want to make sure to zero out the milliseconds?
1657-
ConvertDatetime<int64_t, 1000000L>(*data, out_values);
1682+
// Date64Type is millisecond timestamp; convert to nanoseconds
1683+
if (this->options_.truncate_date64_time) {
1684+
// Truncate intraday milliseconds and convert to nanoseconds
1685+
ConvertDatetimeWithTruncation<1000000L>(*data, out_values);
1686+
} else {
1687+
// Preserve time components and convert to nanoseconds
1688+
ConvertDatetime<int64_t, 1000000L>(*data, out_values);
1689+
}
16581690
} else if (type == Type::TIMESTAMP) {
16591691
const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
16601692

python/pyarrow/src/arrow/python/arrow_to_pandas.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ struct PandasOptions {
8989
/// objects
9090
bool deduplicate_objects = false;
9191

92+
/// If true, truncate intraday milliseconds when converting date64 to pandas
93+
/// datetime (default false to preserve time components).
94+
/// If false, preserve the full datetime including time components.
95+
bool truncate_date64_time = false;
96+
9297
/// \brief For certain data types, a cast is needed in order to store the
9398
/// data in a pandas DataFrame or Series (e.g. timestamps are always stored
9499
/// as nanoseconds in pandas). This option controls whether it is a safe

python/pyarrow/src/arrow/python/numpy_to_arrow.cc

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,14 +183,15 @@ class NumPyConverter {
183183
public:
184184
NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
185185
const std::shared_ptr<DataType>& type, bool from_pandas,
186-
const compute::CastOptions& cast_options = compute::CastOptions())
186+
const compute::CastOptions& cast_options, bool truncate_date64_time)
187187
: pool_(pool),
188188
type_(type),
189189
arr_(reinterpret_cast<PyArrayObject*>(arr)),
190190
dtype_(PyArray_DESCR(arr_)),
191191
mask_(nullptr),
192192
from_pandas_(from_pandas),
193193
cast_options_(cast_options),
194+
truncate_date64_time_(truncate_date64_time),
194195
null_bitmap_data_(nullptr),
195196
null_count_(0) {
196197
if (mo != nullptr && mo != Py_None) {
@@ -311,6 +312,7 @@ class NumPyConverter {
311312

312313
bool from_pandas_;
313314
compute::CastOptions cast_options_;
315+
bool truncate_date64_time_;
314316

315317
// Used in visitor pattern
316318
ArrayVector out_arrays_;
@@ -330,6 +332,7 @@ Status NumPyConverter::Convert() {
330332
PyConversionOptions py_options;
331333
py_options.type = type_;
332334
py_options.from_pandas = from_pandas_;
335+
py_options.truncate_date64_time = truncate_date64_time_;
333336
ARROW_ASSIGN_OR_RAISE(
334337
auto chunked_array,
335338
ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
@@ -845,7 +848,7 @@ Status NumPyConverter::Visit(const StructType& type) {
845848
RETURN_IF_PYERROR();
846849
sub_arrays.emplace_back(sub_array);
847850
sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
848-
from_pandas_);
851+
from_pandas_, cast_options_, truncate_date64_time_);
849852
}
850853
}
851854

@@ -916,7 +919,7 @@ Status NumPyConverter::Visit(const StructType& type) {
916919

917920
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
918921
const std::shared_ptr<DataType>& type,
919-
const compute::CastOptions& cast_options,
922+
const compute::CastOptions& cast_options, bool truncate_date64_time,
920923
std::shared_ptr<ChunkedArray>* out) {
921924
if (!PyArray_Check(ao)) {
922925
// This code path cannot be reached by Python unit tests currently so this
@@ -927,7 +930,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
927930
return Status::Invalid("only handle 1-dimensional arrays");
928931
}
929932

930-
NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
933+
NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options,
934+
truncate_date64_time);
931935
RETURN_NOT_OK(converter.Convert());
932936
const auto& output_arrays = converter.result();
933937
ARROW_DCHECK_GT(output_arrays.size(), 0);
@@ -938,7 +942,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
938942
// Convenience overload: converts with default-constructed cast options.
//
// Date64 truncation is requested here so that callers of this overload keep the
// behavior they had before the truncate_date64_time option existed, when
// intraday milliseconds were always dropped for Python datetime inputs. Callers
// that want to preserve the time of day must use the overload taking
// truncate_date64_time explicitly.
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      std::shared_ptr<ChunkedArray>* out) {
  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(),
                        /*truncate_date64_time=*/true, out);
}
943948

944949
} // namespace py

python/pyarrow/src/arrow/python/numpy_to_arrow.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,13 @@ namespace py {
4646
/// whether values are null
4747
/// \param[in] type a specific type to cast to, may be null
4848
/// \param[in] cast_options casting options
49+
/// \param[in] truncate_date64_time if true, truncate intraday milliseconds when
50+
/// converting Python datetime objects to date64; if false, keep the time of day
4951
/// \param[out] out a ChunkedArray, to accommodate chunked output
5052
ARROW_PYTHON_EXPORT
5153
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
5254
const std::shared_ptr<DataType>& type,
53-
const compute::CastOptions& cast_options,
55+
const compute::CastOptions& cast_options, bool truncate_date64_time,
5456
std::shared_ptr<ChunkedArray>* out);
5557

5658
/// Safely convert NumPy arrays to Arrow. If target data type is not known,

python/pyarrow/src/arrow/python/python_to_arrow.cc

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,14 +303,15 @@ class PyValue {
303303
return value;
304304
}
305305

306-
static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
306+
static Result<int64_t> Convert(const Date64Type*, const O& options, I obj) {
307307
int64_t value;
308308
if (PyDateTime_Check(obj)) {
309309
auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
310310
value = internal::PyDateTime_to_ms(pydate);
311-
// Truncate any intraday milliseconds
312-
// TODO: introduce an option for this
313-
value -= value % 86400000LL;
311+
// Truncate any intraday milliseconds if the option is enabled
312+
if (options.truncate_date64_time) {
313+
value -= value % 86400000LL;
314+
}
314315
} else if (PyDate_Check(obj)) {
315316
auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
316317
value = internal::PyDate_to_ms(pydate);

0 commit comments

Comments
 (0)