Skip to content

Commit 324d25e

Browse files
authored
ARROW-191 Add support for date32 and date64 types (#192)
* ARROW-191 Add support for date32 and date64 types * update changelog
1 parent 7130138 commit 324d25e

File tree

8 files changed

+183
-6
lines changed

8 files changed

+183
-6
lines changed

bindings/python/docs/source/changelog.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Changelog
44
Changes in Version 1.3.0
55
------------------------
66
- Support for Polars
7-
- Support for PyArrow.DataTypes: large_list, large_string
7+
- Support for PyArrow.DataTypes: large_list, large_string, date32, date64
88

99
Changes in Version 1.2.0
1010
------------------------

bindings/python/pymongoarrow/api.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
from bson.raw_bson import RawBSONDocument
2929
from numpy import ndarray
3030
from pyarrow import Schema as ArrowSchema
31-
from pyarrow import Table
31+
from pyarrow import Table, timestamp
32+
from pyarrow.types import is_date32, is_date64
3233
from pymongo.bulk import BulkWriteError
3334
from pymongo.common import MAX_WRITE_BATCH_SIZE
3435

@@ -430,6 +431,17 @@ def write(collection, tabular):
430431
}
431432
tab_size = len(tabular)
432433
if isinstance(tabular, Table):
434+
# Convert date objects to datetime objects.
435+
changed = False
436+
new_types = []
437+
for dtype in tabular.schema.types:
438+
if is_date32(dtype) or is_date64(dtype):
439+
changed = True
440+
dtype = timestamp("ms") # noqa: PLW2901
441+
new_types.append(dtype)
442+
if changed:
443+
cols = [tabular.column(i).cast(new_types[i]) for i in range(tabular.num_columns)]
444+
tabular = Table.from_arrays(cols, names=tabular.column_names)
433445
_validate_schema(tabular.schema.types)
434446
elif isinstance(tabular, pd.DataFrame):
435447
_validate_schema(ArrowSchema.from_pandas(tabular).types)

bindings/python/pymongoarrow/context.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
BinaryBuilder,
2222
BoolBuilder,
2323
CodeBuilder,
24+
Date32Builder,
25+
Date64Builder,
2426
DatetimeBuilder,
2527
Decimal128Builder,
2628
DocumentBuilder,
@@ -45,6 +47,8 @@
4547
_BsonArrowTypes.array: ListBuilder,
4648
_BsonArrowTypes.binary: BinaryBuilder,
4749
_BsonArrowTypes.code: CodeBuilder,
50+
_BsonArrowTypes.date32: Date32Builder,
51+
_BsonArrowTypes.date64: Date64Builder,
4852
}
4953
except ImportError:
5054
pass

bindings/python/pymongoarrow/lib.pyx

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ cdef const bson_t* bson_reader_read_safe(bson_reader_t* stream_reader) except? N
6060
raise InvalidBSON("Could not read BSON document stream")
6161
return doc
6262

63+
64+
# Placeholder numbers for the date types.
65+
cdef uint8_t ARROW_TYPE_DATE32 = 100
66+
cdef uint8_t ARROW_TYPE_DATE64 = 101
67+
6368
_builder_type_map = {
6469
BSON_TYPE_INT32: Int32Builder,
6570
BSON_TYPE_INT64: Int64Builder,
@@ -73,6 +78,8 @@ _builder_type_map = {
7378
BSON_TYPE_ARRAY: ListBuilder,
7479
BSON_TYPE_BINARY: BinaryBuilder,
7580
BSON_TYPE_CODE: CodeBuilder,
81+
ARROW_TYPE_DATE32: Date32Builder,
82+
ARROW_TYPE_DATE64: Date64Builder,
7683
}
7784

7885
_field_type_map = {
@@ -188,6 +195,8 @@ cdef void process_raw_bson_stream(const uint8_t * docstream, size_t length, obje
188195
cdef Decimal128Builder dec128_builder
189196
cdef ListBuilder list_builder
190197
cdef DocumentBuilder doc_builder
198+
cdef Date32Builder date32_builder
199+
cdef Date64Builder date64_builder
191200

192201
# Build up a map of the builders.
193202
for key, value in context.builder_map.items():
@@ -249,6 +258,10 @@ cdef void process_raw_bson_stream(const uint8_t * docstream, size_t length, obje
249258
bson_iter_binary (&doc_iter, &subtype,
250259
&val_buf_len, &val_buf)
251260
builder = BinaryBuilder(subtype)
261+
elif builder_type == Date32Builder:
262+
builder = Date32Builder()
263+
elif builder_type == Date64Builder:
264+
builder = Date64Builder()
252265
else:
253266
builder = builder_type()
254267
if arr_value_builder is None:
@@ -346,6 +359,18 @@ cdef void process_raw_bson_stream(const uint8_t * docstream, size_t length, obje
346359
double_builder.append_raw(bson_iter_as_double(&doc_iter))
347360
else:
348361
double_builder.append_null()
362+
elif ftype == ARROW_TYPE_DATE32:
363+
date32_builder = builder
364+
if value_t == BSON_TYPE_DATE_TIME:
365+
date32_builder.append_raw(bson_iter_date_time(&doc_iter))
366+
else:
367+
date32_builder.append_null()
368+
elif ftype == ARROW_TYPE_DATE64:
369+
date64_builder = builder
370+
if value_t == BSON_TYPE_DATE_TIME:
371+
date64_builder.append_raw(bson_iter_date_time(&doc_iter))
372+
else:
373+
date64_builder.append_null()
349374
elif ftype == BSON_TYPE_DATE_TIME:
350375
datetime_builder = builder
351376
if value_t == BSON_TYPE_DATE_TIME:
@@ -626,6 +651,78 @@ cdef class DatetimeBuilder(_ArrayBuilderBase):
626651
cdef shared_ptr[CTimestampBuilder] unwrap(self):
627652
return self.builder
628653

654+
cdef class Date64Builder(_ArrayBuilderBase):
655+
cdef:
656+
shared_ptr[CDate64Builder] builder
657+
DataType dtype
658+
659+
def __cinit__(self, MemoryPool memory_pool=None):
660+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
661+
self.builder.reset(new CDate64Builder(pool))
662+
self.type_marker = ARROW_TYPE_DATE64
663+
664+
cdef append_raw(self, int64_t value):
665+
self.builder.get().Append(value)
666+
667+
cpdef append(self, value):
668+
self.builder.get().Append(value)
669+
670+
cpdef append_null(self):
671+
self.builder.get().AppendNull()
672+
673+
def __len__(self):
674+
return self.builder.get().length()
675+
676+
@property
677+
def unit(self):
678+
return self.dtype
679+
680+
cpdef finish(self):
681+
cdef shared_ptr[CArray] out
682+
with nogil:
683+
self.builder.get().Finish(&out)
684+
return pyarrow_wrap_array(out)
685+
686+
cdef shared_ptr[CDate64Builder] unwrap(self):
687+
return self.builder
688+
689+
cdef class Date32Builder(_ArrayBuilderBase):
690+
cdef:
691+
shared_ptr[CDate32Builder] builder
692+
DataType dtype
693+
694+
def __cinit__(self, MemoryPool memory_pool=None):
695+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
696+
self.builder.reset(new CDate32Builder(pool))
697+
self.type_marker = ARROW_TYPE_DATE32
698+
699+
cdef append_raw(self, int64_t value):
700+
# Convert from milliseconds to days (1000*60*60*24)
701+
cdef int32_t seconds_val = value // 86400000
702+
self.builder.get().Append(seconds_val)
703+
704+
cpdef append(self, value):
705+
self.builder.get().Append(value)
706+
707+
cpdef append_null(self):
708+
self.builder.get().AppendNull()
709+
710+
def __len__(self):
711+
return self.builder.get().length()
712+
713+
@property
714+
def unit(self):
715+
return self.dtype
716+
717+
cpdef finish(self):
718+
cdef shared_ptr[CArray] out
719+
with nogil:
720+
self.builder.get().Finish(&out)
721+
return pyarrow_wrap_array(out)
722+
723+
cdef shared_ptr[CDate32Builder] unwrap(self):
724+
return self.builder
725+
629726

630727
cdef class BoolBuilder(_ArrayBuilderBase):
631728
cdef:

bindings/python/pymongoarrow/types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ class _BsonArrowTypes(enum.Enum):
5353
array = 10
5454
binary = 11
5555
code = 12
56+
date32 = 13
57+
date64 = 14
5658

5759

5860
# Custom Extension Types.
@@ -266,6 +268,8 @@ def get_numpy_type(type):
266268
_atypes.is_boolean: _BsonArrowTypes.bool,
267269
_atypes.is_struct: _BsonArrowTypes.document,
268270
_atypes.is_list: _BsonArrowTypes.array,
271+
_atypes.is_date32: _BsonArrowTypes.date32,
272+
_atypes.is_date64: _BsonArrowTypes.date64,
269273
_atypes.is_large_string: _BsonArrowTypes.string,
270274
_atypes.is_large_list: _BsonArrowTypes.array,
271275
}

bindings/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ archs = "x86_64 arm64"
9595
[tool.pytest.ini_options]
9696
minversion = "7"
9797
addopts = ["-ra", "--strict-config", "--strict-markers", "--durations=5", "--junitxml=xunit-results/TEST-results.xml"]
98-
testpaths = ["test"]
98+
testpaths = ["test", "test/pandas_types"]
9999
log_cli_level = "INFO"
100100
norecursedirs = ["test/*"]
101101
faulthandler_timeout = 1500

bindings/python/test/test_arrow.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import tempfile
1616
import unittest
1717
import unittest.mock as mock
18-
from datetime import datetime
18+
from datetime import date, datetime
1919
from test import client_context
2020
from test.utils import AllowListEventListener, NullsTestMixin
2121

@@ -28,6 +28,8 @@
2828
Table,
2929
bool_,
3030
csv,
31+
date32,
32+
date64,
3133
decimal256,
3234
field,
3335
int32,
@@ -316,6 +318,21 @@ def test_write_schema_validation(self):
316318
with self.assertRaises(ValueError):
317319
self.round_trip(data, Schema(schema))
318320

321+
def test_date_types(self):
322+
schema, data = self._create_data()
323+
self.round_trip(data, Schema(schema))
324+
325+
schema = {"_id": int32(), "date32": date32(), "date64": date64()}
326+
data = Table.from_pydict(
327+
{
328+
"_id": [i for i in range(2)],
329+
"date32": [date(2012, 1, 1) for _ in range(2)],
330+
"date64": [datetime(2012, 1, 1) for _ in range(2)],
331+
},
332+
ArrowSchema(schema),
333+
)
334+
self.round_trip(data, Schema(schema))
335+
319336
@mock.patch.object(Collection, "insert_many", side_effect=Collection.insert_many, autospec=True)
320337
def test_write_batching(self, mock):
321338
schema = {
@@ -349,6 +366,8 @@ def _create_nested_data(self, nested_elem=None):
349366
"ObjectId": [ObjectId().binary for i in range(3)],
350367
"Decimal128": [Decimal128(str(i)).bid for i in range(3)],
351368
"Code": [str(i) for i in range(3)],
369+
"date32": [date(2012, 1, 1) for i in range(3)],
370+
"date64": [date(2012, 1, 1) for i in range(3)],
352371
}
353372

354373
def inner(i):
@@ -363,6 +382,8 @@ def inner(i):
363382
Binary=Binary(bytes(i), 10),
364383
ObjectId=ObjectId().binary,
365384
Code=str(i),
385+
date32=date(2012, 1, 1),
386+
date64=date(2014, 1, 1),
366387
)
367388
if nested_elem:
368389
inner_dict["list"] = [nested_elem]

bindings/python/test/test_builders.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
import calendar
15-
from datetime import datetime, timedelta
15+
from datetime import date, datetime, timedelta
1616
from unittest import TestCase
1717

1818
from bson import Binary, Code, Decimal128, ObjectId
@@ -22,6 +22,8 @@
2222
BinaryBuilder,
2323
BoolBuilder,
2424
CodeBuilder,
25+
Date32Builder,
26+
Date64Builder,
2527
DatetimeBuilder,
2628
Decimal128Builder,
2729
DocumentBuilder,
@@ -62,7 +64,7 @@ def setUp(self):
6264
self.data_type = int64()
6365

6466

65-
class TestDate64Builder(TestCase):
67+
class TestDatetimeBuilder(TestCase):
6668
def test_default_unit(self):
6769
# Check default unit
6870
builder = DatetimeBuilder()
@@ -281,3 +283,40 @@ def test_simple(self):
281283
self.assertEqual(arr.null_count, 1)
282284
self.assertEqual(len(arr), 5)
283285
self.assertEqual(arr.to_pylist(), codes + [None])
286+
287+
288+
class TestDate32Builder(TestCase):
289+
def test_simple(self):
290+
epoch = date(1970, 1, 1)
291+
values = [date(2012, 1, 1), date(2012, 1, 2), date(2014, 4, 5)]
292+
builder = Date32Builder()
293+
builder.append(values[0].toordinal() - epoch.toordinal())
294+
builder.append_values([v.toordinal() - epoch.toordinal() for v in values[1:]])
295+
builder.append_null()
296+
arr = builder.finish()
297+
298+
self.assertIsInstance(arr, Array)
299+
self.assertEqual(arr.null_count, 1)
300+
self.assertEqual(len(arr), 4)
301+
self.assertEqual(arr.to_pylist(), values + [None])
302+
303+
304+
class TestDate64Builder(TestCase):
305+
def test_simple(self):
306+
def msec_since_epoch(d):
307+
epoch = datetime(1970, 1, 1)
308+
d = datetime.fromordinal(d.toordinal())
309+
diff = d - epoch
310+
return diff.total_seconds() * 1000
311+
312+
values = [date(2012, 1, 1), date(2012, 1, 2), date(2014, 4, 5)]
313+
builder = Date64Builder()
314+
builder.append(msec_since_epoch(values[0]))
315+
builder.append_values([msec_since_epoch(v) for v in values[1:]])
316+
builder.append_null()
317+
arr = builder.finish()
318+
319+
self.assertIsInstance(arr, Array)
320+
self.assertEqual(arr.null_count, 1)
321+
self.assertEqual(len(arr), 4)
322+
self.assertEqual(arr.to_pylist(), values + [None])

0 commit comments

Comments
 (0)