GH-48672, GH-48465: [Python] Add an option for truncating intraday milliseconds in Date64

HyukjinKwon · HyukjinKwon · commit 3e75327e8084 · 2026-01-09T16:34:39.000+09:00
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
@@ -27,7 +27,8 @@ cdef extern from "<variant>" namespace "std":
     T get[T](...)
 
 cdef _sequence_to_array(object sequence, object mask, object size,
-                        DataType type, CMemoryPool* pool, c_bool from_pandas):
+                        DataType type, CMemoryPool* pool, c_bool from_pandas,
+                        bint truncate_date64_time):
     cdef:
         int64_t c_size
         PyConversionOptions options
@@ -41,6 +42,7 @@ cdef _sequence_to_array(object sequence, object mask, object size,
 
     options.from_pandas = from_pandas
     options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
+    options.truncate_date64_time = truncate_date64_time
 
     with nogil:
         chunked = GetResultValue(
@@ -81,15 +83,16 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values,
 
 
 cdef _ndarray_to_array(object values, object mask, DataType type,
-                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+                       c_bool from_pandas, c_bool safe, CMemoryPool* pool,
+                       bint truncate_date64_time):
     cdef:
         shared_ptr[CChunkedArray] chunked_out
         shared_ptr[CDataType] c_type = _ndarray_to_type(values, type)
         CCastOptions cast_options = CCastOptions(safe)
 
     with nogil:
         check_status(NdarrayToArrow(pool, values, mask, from_pandas,
-                                    c_type, cast_options, &chunked_out))
+                                    c_type, cast_options, truncate_date64_time, &chunked_out))
 
     if chunked_out.get().num_chunks() > 1:
         return pyarrow_wrap_chunked_array(chunked_out)
@@ -127,7 +130,7 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
 
 
 def array(object obj, type=None, mask=None, size=None, from_pandas=None,
-          bint safe=True, MemoryPool memory_pool=None):
+          bint safe=True, MemoryPool memory_pool=None, bint truncate_date64_time=True):
     """
     Create pyarrow.Array instance from a Python object.
 
@@ -162,6 +165,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
     memory_pool : pyarrow.MemoryPool, optional
         If not passed, will allocate memory from the currently-set default
         memory pool.
+    truncate_date64_time : bool, default True
+        If True (default), truncate intraday milliseconds when converting Python
+        datetime objects to date64.
+        If False, preserve the full datetime including time components.
 
     Returns
     -------
@@ -313,7 +320,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
         elif (pandas_api.is_categorical(values) and
               type is not None and type.id != Type_DICTIONARY):
             result = _ndarray_to_array(
-                np.asarray(values), mask, type, c_from_pandas, safe, pool
+                np.asarray(
+                    values), mask, type, c_from_pandas, safe, pool, truncate_date64_time
             )
         elif pandas_api.is_categorical(values):
             if type is not None:
@@ -358,21 +366,22 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
                     values, obj.dtype, type)
             if type and type.id == _Type_RUN_END_ENCODED:
                 arr = _ndarray_to_array(
-                    values, mask, type.value_type, c_from_pandas, safe, pool)
+                    values, mask, type.value_type, c_from_pandas, safe, pool, truncate_date64_time)
                 result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
                                               memory_pool=memory_pool)
             else:
                 result = _ndarray_to_array(values, mask, type, c_from_pandas, safe,
-                                           pool)
+                                           pool, truncate_date64_time)
     else:
         if type and type.id == _Type_RUN_END_ENCODED:
             arr = _sequence_to_array(
-                obj, mask, size, type.value_type, pool, from_pandas)
+                obj, mask, size, type.value_type, pool, from_pandas, truncate_date64_time)
             result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
                                           memory_pool=memory_pool)
         # ConvertPySequence does strict conversion if type is explicitly passed
         else:
-            result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
+            result = _sequence_to_array(
+                obj, mask, size, type, pool, c_from_pandas, truncate_date64_time)
 
     if extension_type is not None:
         result = ExtensionArray.from_storage(extension_type, result)
@@ -880,7 +889,8 @@ cdef class _PandasConvertible(_Weakrefable):
             bint self_destruct=False,
             str maps_as_pydicts=None,
             types_mapper=None,
-            bint coerce_temporal_nanoseconds=False
+            bint coerce_temporal_nanoseconds=False,
+            bint truncate_date64_time=False
     ):
         """
         Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -965,6 +975,10 @@ cdef class _PandasConvertible(_Weakrefable):
             default behavior in pandas version 1.x. Set this option to True if
             you'd like to use this coercion when using pandas version >= 2.0
             for backwards compatibility (not recommended otherwise).
+        truncate_date64_time : bool, default False
+            If True, truncate intraday milliseconds when converting date64 to pandas
+            datetime.
+            If False (default), preserve the full datetime including time components.
 
         Returns
         -------
@@ -1041,6 +1055,7 @@ cdef class _PandasConvertible(_Weakrefable):
             split_blocks=split_blocks,
             self_destruct=self_destruct,
             maps_as_pydicts=maps_as_pydicts,
+            truncate_date64_time=truncate_date64_time,
             coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
         )
         return self._to_pandas(options, categories=categories,
@@ -1063,6 +1078,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
     result.self_destruct = options['self_destruct']
     result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
     result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
+    result.truncate_date64_time = options['truncate_date64_time']
 
     maps_as_pydicts = options['maps_as_pydicts']
     if maps_as_pydicts is None:
diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd
@@ -66,6 +66,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool from_pandas
         c_bool ignore_timezone
         c_bool strict
+        c_bool truncate_date64_time
 
     # TODO Some functions below are not actually "nogil"
 
@@ -81,12 +82,14 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
                            c_bool from_pandas,
                            const shared_ptr[CDataType]& type,
+                           c_bool truncate_date64_time,
                            shared_ptr[CChunkedArray]* out)
 
     CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
                            c_bool from_pandas,
                            const shared_ptr[CDataType]& type,
                            const CCastOptions& cast_options,
+                           c_bool truncate_date64_time,
                            shared_ptr[CChunkedArray]* out)
 
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
@@ -193,6 +196,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool coerce_temporal_nanoseconds
         c_bool ignore_timezone
         c_bool deduplicate_objects
+        c_bool truncate_date64_time
         c_bool safe_cast
         c_bool split_blocks
         c_bool self_destruct
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -597,7 +597,7 @@ def dataframe_to_types(df, preserve_index, columns=None):
 
 
 def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
-                        safe=True):
+                        safe=True, truncate_date64_time=True):
     (all_names,
      column_names,
      column_field_names,
@@ -630,7 +630,8 @@ def convert_column(col, field):
             type_ = field.type
 
         try:
-            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
+            result = pa.array(col, type=type_, from_pandas=True, safe=safe,
+                              truncate_date64_time=truncate_date64_time)
         except (pa.ArrowInvalid,
                 pa.ArrowNotImplementedError,
                 pa.ArrowTypeError) as e:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
@@ -1598,7 +1598,8 @@ cdef object get_scalar_class_from_type(
         return _scalar_classes[data_type.id()]
 
 
-def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
+def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None,
+           bint truncate_date64_time=True):
     """
     Create a pyarrow.Scalar instance from a Python object.
 
@@ -1616,6 +1617,10 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
     memory_pool : pyarrow.MemoryPool, optional
         If not passed, will allocate memory from the currently-set default
         memory pool.
+    truncate_date64_time : bool, default True
+        If True (default), truncate intraday milliseconds when converting Python
+        datetime objects to date64.
+        If False, preserve the full datetime including time components.
 
     Returns
     -------
@@ -1668,6 +1673,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
     else:
         options.from_pandas = from_pandas
 
+    options.truncate_date64_time = truncate_date64_time
+
     value = [value]
     with nogil:
         chunked = GetResultValue(ConvertPySequence(value, None, options, pool))
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -1547,6 +1547,26 @@ void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) {
   }
 }
 
+template <int64_t SHIFT>
+inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const int64_t* in_values = GetPrimitiveValues<int64_t>(arr);
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (arr.IsNull(i)) {
+        *out_values++ = kPandasTimestampNull;
+      } else {
+        int64_t truncated = in_values[i] - in_values[i] % kMillisecondsInDay;
+        if constexpr (SHIFT == 1) {
+          *out_values++ = truncated;
+        } else {
+          *out_values++ = truncated * SHIFT;
+        }
+      }
+    }
+  }
+}
+
 class DatetimeDayWriter : public TypedPandasWriter<NPY_DATETIME> {
  public:
   using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
@@ -1617,7 +1637,14 @@ class DatetimeMilliWriter : public DatetimeWriter<TimeUnit::MILLI> {
       // Convert from days since epoch to datetime64[ms]
       ConvertDatetime<int32_t, 86400000L>(*data, out_values);
     } else if (type == Type::DATE64) {
-      ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+      // Date64Type is millisecond timestamp
+      if (this->options_.truncate_date64_time) {
+        // Truncate intraday milliseconds
+        ConvertDatetimeWithTruncation<1L>(*data, out_values);
+      } else {
+        // Preserve time components
+        ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+      }
     } else {
       const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
       ARROW_DCHECK_EQ(TimeUnit::MILLI, ts_type.unit())
@@ -1652,9 +1679,14 @@ class DatetimeNanoWriter : public DatetimeWriter<TimeUnit::NANO> {
       // Convert from days since epoch to datetime64[ns]
       ConvertDatetime<int32_t, kNanosecondsInDay>(*data, out_values);
     } else if (type == Type::DATE64) {
-      // Date64Type is millisecond timestamp stored as int64_t
-      // TODO(wesm): Do we want to make sure to zero out the milliseconds?
-      ConvertDatetime<int64_t, 1000000L>(*data, out_values);
+      // Date64Type is millisecond timestamp; convert to nanoseconds
+      if (this->options_.truncate_date64_time) {
+        // Truncate intraday milliseconds and convert to nanoseconds
+        ConvertDatetimeWithTruncation<1000000L>(*data, out_values);
+      } else {
+        // Preserve time components and convert to nanoseconds
+        ConvertDatetime<int64_t, 1000000L>(*data, out_values);
+      }
     } else if (type == Type::TIMESTAMP) {
       const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
 
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -89,6 +89,11 @@ struct PandasOptions {
   /// objects
   bool deduplicate_objects = false;
 
+  /// If true, truncate intraday milliseconds when converting date64 to pandas
+  /// datetime (default false to preserve time components).
+  /// If false, preserve the full datetime including time components.
+  bool truncate_date64_time = false;
+
   /// \brief For certain data types, a cast is needed in order to store the
   /// data in a pandas DataFrame or Series (e.g. timestamps are always stored
   /// as nanoseconds in pandas). This option controls whether it is a safe
diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -183,14 +183,15 @@ class NumPyConverter {
  public:
   NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
                  const std::shared_ptr<DataType>& type, bool from_pandas,
-                 const compute::CastOptions& cast_options = compute::CastOptions())
+                 const compute::CastOptions& cast_options, bool truncate_date64_time)
       : pool_(pool),
         type_(type),
         arr_(reinterpret_cast<PyArrayObject*>(arr)),
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         from_pandas_(from_pandas),
         cast_options_(cast_options),
+        truncate_date64_time_(truncate_date64_time),
         null_bitmap_data_(nullptr),
         null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
@@ -311,6 +312,7 @@ class NumPyConverter {
 
   bool from_pandas_;
   compute::CastOptions cast_options_;
+  bool truncate_date64_time_;
 
   // Used in visitor pattern
   ArrayVector out_arrays_;
@@ -330,6 +332,7 @@ Status NumPyConverter::Convert() {
     PyConversionOptions py_options;
     py_options.type = type_;
     py_options.from_pandas = from_pandas_;
+    py_options.truncate_date64_time = truncate_date64_time_;
     ARROW_ASSIGN_OR_RAISE(
         auto chunked_array,
         ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
@@ -845,7 +848,7 @@ Status NumPyConverter::Visit(const StructType& type) {
       RETURN_IF_PYERROR();
       sub_arrays.emplace_back(sub_array);
       sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
-                                  from_pandas_);
+                                  from_pandas_, cast_options_, truncate_date64_time_);
     }
   }
 
@@ -916,7 +919,7 @@ Status NumPyConverter::Visit(const StructType& type) {
 
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
-                      const compute::CastOptions& cast_options,
+                      const compute::CastOptions& cast_options, bool truncate_date64_time,
                       std::shared_ptr<ChunkedArray>* out) {
   if (!PyArray_Check(ao)) {
     // This code path cannot be reached by Python unit tests currently so this
@@ -927,7 +930,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
     return Status::Invalid("only handle 1-dimensional arrays");
   }
 
-  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
+  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options,
+                           truncate_date64_time);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
   ARROW_DCHECK_GT(output_arrays.size(), 0);
@@ -938,7 +942,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
                       std::shared_ptr<ChunkedArray>* out) {
-  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), false,
+                        out);
 }
 
 }  // namespace py
diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.h b/python/pyarrow/src/arrow/python/numpy_to_arrow.h
@@ -46,11 +46,13 @@ namespace py {
 /// whether values are null
 /// \param[in] type a specific type to cast to, may be null
 /// \param[in] cast_options casting options
+/// \param[in] truncate_date64_time If true, truncate intraday milliseconds when
+/// converting Python datetime objects to date64 (default true)
 /// \param[out] out a ChunkedArray, to accommodate chunked output
 ARROW_PYTHON_EXPORT
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
-                      const compute::CastOptions& cast_options,
+                      const compute::CastOptions& cast_options, bool truncate_date64_time,
                       std::shared_ptr<ChunkedArray>* out);
 
 /// Safely convert NumPy arrays to Arrow. If target data type is not known,
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -303,14 +303,15 @@ class PyValue {
     return value;
   }
 
-  static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
+  static Result<int64_t> Convert(const Date64Type*, const O& options, I obj) {
     int64_t value;
     if (PyDateTime_Check(obj)) {
       auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
       value = internal::PyDateTime_to_ms(pydate);
-      // Truncate any intraday milliseconds
-      // TODO: introduce an option for this
-      value -= value % 86400000LL;
+      // Truncate any intraday milliseconds if the option is enabled
+      if (options.truncate_date64_time) {
+        value -= value % 86400000LL;
+      }
     } else if (PyDate_Check(obj)) {
       auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
       value = internal::PyDate_to_ms(pydate);
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.h b/python/pyarrow/src/arrow/python/python_to_arrow.h
diff --git a/python/pyarrow/src/arrow/python/type_traits.h b/python/pyarrow/src/arrow/python/type_traits.h
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py