Skip to content

Commit 87bed4a

Browse files
authored
ARROW-127 Time Zones are Not Properly Handled by Auto Schema (#106)
* ARROW-127 Time Zones are Not Properly Handled by Auto Schema * clean up handling of tz info * undo change to test
1 parent e393e75 commit 87bed4a

File tree

7 files changed

+17
-11
lines changed

7 files changed

+17
-11
lines changed

.github/workflows/test-python.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ jobs:
8080
run: |
8181
# Install the test deps
8282
LIBBSON_INSTALL_DIR=$(pwd)/libbson python -m pip install -vv -e ".[test]"
83-
LD_LIBRARY_PATH=$(pwd)/libbson/lib python -m unittest discover test -v
83+
PYTHONWARNINGS=error LD_LIBRARY_PATH=$(pwd)/libbson/lib python -m unittest discover test -v
8484
- name: Check the manifest
8585
run: |
8686
pip install check-manifest

bindings/python/pymongoarrow/context.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
6565
:class:`~bson.codec_options.CodecOptions`.
6666
"""
6767
if schema is None:
68-
return cls(schema, {})
68+
return cls(schema, {}, codec_options)
6969

7070
builder_map = {}
7171
str_type_map = _get_internal_typemap(schema.typemap)

bindings/python/pymongoarrow/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def process_bson_stream(bson_stream, context):
111111

112112
builder_type = _builder_type_map[ftype]
113113
if builder_type == DatetimeBuilder and context.tzinfo is not None:
114-
arrow_type = timestamp(arrow_type.unit, tz=context.tzinfo)
114+
arrow_type = timestamp('ms', tz=context.tzinfo)
115115
builder_map[key] = builder_type(dtype=arrow_type)
116116
else:
117117
builder_map[key] = builder_type()

bindings/python/pymongoarrow/types.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,10 @@ def _is_decimal128_str(obj):
7575
Int64: lambda _: int64(),
7676
float: lambda _: float64(),
7777
int: lambda _: int64(),
78-
datetime: lambda _: timestamp("ms"), # TODO: add tzinfo support
78+
# Note: we cannot infer a timezone from a raw datetime class,
79+
# if a timezone is preferred then a timestamp with tz information
80+
# must be used directly.
81+
datetime: lambda _: timestamp("ms"),
7982
ObjectId: lambda _: ObjectIdType(),
8083
Decimal128: lambda _: Decimal128StringType(),
8184
str: lambda _: string(),
@@ -88,7 +91,7 @@ def _is_decimal128_str(obj):
8891
_atypes.is_int64: np.int64,
8992
_atypes.is_float64: np.float64,
9093
_atypes.is_timestamp: "datetime64[ms]",
91-
_is_objectid: np.object,
94+
_is_objectid: object,
9295
_atypes.is_string: np.str_,
9396
_atypes.is_boolean: np.bool_,
9497
}

bindings/python/test/test_arrow.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ def test_auto_schema_tz(self):
373373
ArrowSchema(
374374
{
375375
"bool": bool_(),
376-
"dt": timestamp("ms"),
376+
"dt": timestamp("ms", "US/Eastern"),
377377
"string": string(),
378378
}
379379
),
@@ -388,6 +388,7 @@ def test_auto_schema_tz(self):
388388
self.coll.with_options(codec_options=codec_options),
389389
{} if func == find_arrow_all else [],
390390
).drop(["_id"])
391+
self.assertEqual(out["dt"].type.tz, "US/Eastern")
391392
self.assertEqual(data, out)
392393

393394

bindings/python/test/test_numpy.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,9 +254,7 @@ def test_auto_schema_tz(self):
254254
data = {
255255
"string": [str(i) for i in range(3)],
256256
"bool": [True for _ in range(3)],
257-
"dt": [
258-
datetime.datetime(1970 + i, 1, 1, tzinfo=timezone("US/Eastern")) for i in range(3)
259-
],
257+
"dt": [datetime.datetime(1970 + i, 1, 1) for i in range(3)],
260258
}
261259
data = self.schemafied_ndarray_dict(data, schema)
262260
self.coll.drop()
@@ -270,6 +268,10 @@ def test_auto_schema_tz(self):
270268
{} if func == find_numpy_all else [],
271269
)
272270
del out["_id"]
271+
# Note: Numpy does not handle timezones.
272+
assert out["dt"].dtype == np.dtype("<M8[ns]")
273+
assert data["dt"].dtype == np.dtype("<M8[ms]")
274+
out.pop("dt")
273275
self.assert_numpy_equal(data, out)
274276

275277

bindings/python/test/test_pandas.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_write_schema_validation(self):
146146
Schema(arrow_schema),
147147
)
148148

149-
schema = {"_id": "int32", "data": np.ubyte()}
149+
schema = {"_id": "int32", "data": bytes}
150150
data = pd.DataFrame(
151151
data={"_id": [i for i in range(2)], "data": [i for i in range(2)]}
152152
).astype(schema)
@@ -223,7 +223,7 @@ def test_auto_schema_heterogeneous(self):
223223
def test_auto_schema_tz(self):
224224
schema = {
225225
"bool": "bool",
226-
"dt": "datetime64[ns]",
226+
"dt": "datetime64[ns, US/Eastern]",
227227
"string": "str",
228228
}
229229
data = pd.DataFrame(

0 commit comments

Comments
 (0)