ENH: improve support for datetime columns #486
@@ -2,6 +2,7 @@
 import os
 import warnings
+from datetime import datetime
 
 import numpy as np
@@ -12,6 +13,7 @@ | |
PANDAS_GE_22, | ||
PANDAS_GE_30, | ||
PYARROW_GE_19, | ||
__gdal_version__, | ||
) | ||
from pyogrio.errors import DataSourceError | ||
from pyogrio.raw import ( | ||
|
@@ -37,33 +39,56 @@ def _stringify_path(path):
     return path
 
 
-def _try_parse_datetime(ser):
+def _try_parse_datetime(ser, datetimes="UTC"):
     import pandas as pd  # only called when pandas is known to be installed
+    from pandas.api.types import is_string_dtype
+
+    datetimes = datetimes.upper()
+    datetime_kwargs = {}
+    if datetimes == "STRING":
+        if not is_string_dtype(ser.dtype):
+            res = ser.astype("string").str.replace(" ", "T")
+            return res
+        if __gdal_version__ < (3, 7, 0):
+            # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that
+            return ser.str.replace(" ", "T").str.replace("/", "-")
+        return ser
+    elif datetimes == "UTC":
+        datetime_kwargs["utc"] = True
+    elif datetimes == "DATETIME":
+        datetime_kwargs["utc"] = False
+    else:
+        raise ValueError(
+            f"Invalid value for 'datetimes': {datetimes!r}. "
+            "Must be 'UTC', 'DATETIME' or 'STRING'."
+        )
 
     if PANDAS_GE_22:
-        datetime_kwargs = {"format": "ISO8601"}
+        datetime_kwargs["format"] = "ISO8601"
     elif PANDAS_GE_20:
-        datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
+        datetime_kwargs["format"] = "ISO8601"
+        datetime_kwargs["errors"] = "ignore"
     else:
-        datetime_kwargs = {"yearfirst": True}
+        datetime_kwargs["yearfirst"] = True
+
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
             ".*parsing datetimes with mixed time zones will raise.*",
             FutureWarning,
         )
         # pre-emptive try catch for when pandas will raise
         # (can tighten the exception type in future when it does)
+        res = ser
         try:
             res = pd.to_datetime(ser, **datetime_kwargs)
-        except Exception:
-            res = ser
-    # if object dtype, try parse as utc instead
-    if res.dtype in ("object", "string"):
-        try:
-            res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
-        except Exception:
-            pass
+        except ValueError as ex:
+            if "Mixed timezones detected" in str(ex):
+                # Parsing mixed timezones with to_datetime is not supported
+                # anymore in pandas>=3.0, so convert to pd.Timestamp objects
+                # manually. Using map twice seems to be the fastest way to do
+                # this.
+                res = ser.map(datetime.fromisoformat, na_action="ignore").map(
+                    pd.Timestamp, na_action="ignore"
+                )
 
     if res.dtype.kind == "M":  # any datetime64
         # GDAL only supports ms precision, convert outputs to match.
@@ -73,6 +98,7 @@ def _try_parse_datetime(ser):
             res = res.dt.as_unit("ms")
         else:
             res = res.dt.round(freq="ms")
+
     return res
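For illustration, a minimal sketch (not part of the diff) of the two parsing paths handled above; the ISO8601 strings are hypothetical values of the kind GDAL returns:

```python
from datetime import datetime

import pandas as pd

# Hypothetical column with mixed UTC offsets, read as ISO8601 strings.
ser = pd.Series(["2023-01-15T10:00:00+01:00", "2023-07-15T10:00:00+02:00", None])

# datetimes="UTC": utc=True always succeeds; everything is normalized to UTC.
print(pd.to_datetime(ser, utc=True))
# 0   2023-01-15 09:00:00+00:00
# 1   2023-07-15 08:00:00+00:00
# 2                         NaT
# dtype: datetime64[ns, UTC]

# datetimes="DATETIME": with utc=False, mixed offsets make pd.to_datetime raise
# (pandas >= 3.0), so the fallback builds pd.Timestamp objects one by one,
# keeping each value's own offset in an object column.
res = ser.map(datetime.fromisoformat, na_action="ignore").map(
    pd.Timestamp, na_action="ignore"
)
print(res.dtype)  # object
```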
@@ -96,6 +122,7 @@ def read_dataframe(
     use_arrow=None,
     on_invalid="raise",
     arrow_to_pandas_kwargs=None,
+    datetimes="UTC",
     **kwargs,
 ):
     """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
@@ -223,8 +250,22 @@ def read_dataframe(
     arrow_to_pandas_kwargs : dict, optional (default: None)
         When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_
         call for the arrow to pandas conversion.
+    datetimes : str, optional (default: "UTC")
+        The way datetime columns should be returned. Possible values:
+
+        - **"UTC"**: return all datetime columns as pandas datetime64 columns
+          converted to UTC. Naive datetimes (without timezone information) are
+          assumed to be in UTC.
+        - **"DATETIME"**: return datetimes in the timezone they have in the
+          data source. Columns with values in a single timezone or without
+          timezone information are returned as pandas datetime64 columns.
+          Columns with mixed timezone data are returned as object columns
+          with pandas.Timestamp values. If you want to roundtrip datetimes
+          as faithfully as possible, use this option.
+        - **"STRING"**: return all datetimes as ISO8601 strings.
+
     **kwargs
         Additional driver-specific dataset open options passed to OGR. Invalid
         options will trigger a warning.
 
     Returns
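A usage sketch of the new parameter; the file name `example.gpkg` is hypothetical:

```python
import pyogrio

# Default: tz-aware datetime64 columns, normalized to UTC.
df = pyogrio.read_dataframe("example.gpkg", datetimes="UTC")

# Keep original timezones; mixed-offset columns become object columns of
# pandas.Timestamp values.
df = pyogrio.read_dataframe("example.gpkg", datetimes="DATETIME")

# No parsing at all: ISO8601 strings such as "2023-01-15T10:00:00+01:00".
df = pyogrio.read_dataframe("example.gpkg", datetimes="STRING")
```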
@@ -267,11 +308,10 @@ def read_dataframe(
     read_func = read_arrow if use_arrow else read
     gdal_force_2d = False if use_arrow else force_2d
-    if not use_arrow:
-        # For arrow, datetimes are read as is.
-        # For numpy IO, datetimes are read as string values to preserve timezone info
-        # as numpy does not directly support timezones.
-        kwargs["datetime_as_string"] = True
+    # Always read datetimes as string values to preserve (mixed) timezone info,
+    # as numpy does not directly support timezones and arrow datetime columns
+    # don't support mixed timezones.
 
     result = read_func(
         path_or_buffer,
         layer=layer,
@@ -288,6 +328,7 @@ def read_dataframe(
         sql=sql,
         sql_dialect=sql_dialect,
         return_fids=fid_as_index,
+        datetime_as_string=True,
         **kwargs,
     )
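For context, a sketch of what this passes to the lower-level reader; `example.gpkg` is hypothetical, and the 4-tuple unpacking assumes the documented return of `pyogrio.raw.read`:

```python
from pyogrio.raw import read

# With datetime_as_string=True, datetime fields come back as arrays of
# ISO8601 strings rather than datetime64 values, so offsets like "+01:00"
# survive numpy's lack of timezone support.
meta, index, geometry, field_data = read("example.gpkg", datetime_as_string=True)
```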
@@ -330,6 +371,11 @@ def read_dataframe(
         del table
 
+        # convert datetime columns that were read as string to datetime
+        for dtype, column in zip(meta["dtypes"], meta["fields"]):
+            if dtype is not None and dtype.startswith("datetime"):
+                df[column] = _try_parse_datetime(df[column], datetimes=datetimes)
+
         if fid_as_index:
             df = df.set_index(meta["fid_column"])
             df.index.names = ["fid"]
@@ -361,7 +407,7 @@ def read_dataframe(
         df = pd.DataFrame(data, columns=columns, index=index)
         for dtype, c in zip(meta["dtypes"], df.columns):
             if dtype.startswith("datetime"):
-                df[c] = _try_parse_datetime(df[c])
+                df[c] = _try_parse_datetime(df[c], datetimes=datetimes)
 
         if geometry is None or not read_geometry:
             return df
@@ -584,6 +630,7 @@ def write_dataframe(
         crs = geometry.crs.to_wkt("WKT1_GDAL")
 
     if use_arrow:
+        import pandas as pd  # only called when pandas is known to be installed
         import pyarrow as pa
 
         from pyogrio.raw import write_arrow
@@ -619,8 +666,33 @@ def write_dataframe(
         df = pd.DataFrame(df, copy=False)
         df[geometry_column] = geometry
 
+        # Convert all datetime columns to isoformat strings, to avoid mixed
+        # timezone information getting lost.
+        datetime_cols = []
+        for name, dtype in df.dtypes.items():
+            col = df[name]
+            if dtype == "object":
+                # If the first non-NA value is a datetime-like object, treat it
+                # as a datetime column.
+                first_non_na_index = col.first_valid_index()
+                if first_non_na_index is not None:
+                    if isinstance(col[first_non_na_index], (pd.Timestamp, datetime)):
+                        df[name] = col.astype("string")
+                        datetime_cols.append(name)
+            elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
+                # For datetime columns with a timezone other than UTC, convert
+                # to string as well, otherwise the timezone info is lost.
+                df[name] = col.astype("string")
+                datetime_cols.append(name)
+

Review comment: Do you know why GDAL preserves the tz-awareness for UTC, but not for other offsets (even though the values written to the file are in UTC) in the Arrow write path?

Reply (theroggy): I didn't test it again explicitly, but this follows the general logic that most timezones use daylight saving time, leading to (potentially) different offsets within a single column, and with different offsets in one column the timezone information gets lost. I updated the inline comment to clarify this.

         table = pa.Table.from_pandas(df, preserve_index=False)
 
+        # Add metadata to datetime columns so GDAL knows they are datetimes.
+        for datetime_col in datetime_cols:
+            table = _add_column_metadata(
+                table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}}
+            )
+
         # Null arrow columns are not supported by GDAL, so convert to string
         for field_index, field in enumerate(table.schema):
             if field.type == pa.null():
@@ -678,6 +750,8 @@ def write_dataframe(
     gdal_tz_offsets = {}
     for name in fields:
         col = df[name]
+        values = None
+
         if isinstance(col.dtype, pd.DatetimeTZDtype):
             # Deal with datetimes with timezones by passing down timezone separately
             # pass down naive datetime
@@ -692,8 +766,24 @@ def write_dataframe(
             # Convert each row offset to a signed multiple of 15m and add to GMT value
             gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
             gdal_tz_offsets[name] = gdal_offset_representation.values
-        else:
+
+        elif col.dtype == "object":
+            # Column of Timestamp/datetime objects: split into naive datetime and tz.
+            col_na = df[col.notna()][name]
+            if len(col_na) and all(
+                isinstance(x, (pd.Timestamp, datetime)) for x in col_na
+            ):
+                tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset())
+                gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
+                gdal_tz_offsets[name] = gdal_offset_repr.values
+                naive = col.apply(
+                    lambda x: None if pd.isna(x) else x.replace(tzinfo=None)
+                )
+                values = naive.values
+
+        if values is None:
             values = col.values
+
         if isinstance(values, pd.api.extensions.ExtensionArray):
             from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
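A worked example of the encoding above, mirroring the arithmetic in the diff: the offset is expressed as a signed count of 15-minute blocks and biased by 100 (GMT):

```python
import pandas as pd

# +02:00 -> 120 min ->  8 blocks of 15 min -> 100 + 8  = 108
# -05:00 -> -300 min -> -20 blocks of 15 min -> 100 - 20 = 80
for offset in [pd.Timedelta("2h"), pd.Timedelta("-5h")]:
    print(offset, "->", offset // pd.Timedelta("15min") + 100)
```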
@@ -729,3 +819,48 @@ def write_dataframe(
         gdal_tz_offsets=gdal_tz_offsets,
         **kwargs,
     )
+
+
+def _add_column_metadata(table, column_metadata: dict = {}):
+    """Add or update column-level metadata to an arrow table.
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+        The table to add the column metadata to.
+    column_metadata : dict
+        A dictionary with column metadata in the form
+        {
+            "column_1": {"some": "data"},
+            "column_2": {"more": "stuff"},
+        }
+
+    Returns
+    -------
+    pyarrow.Table: table with the updated column metadata.
+    """
+    import pyarrow as pa
+
+    if not column_metadata:
+        return table
+
+    # Create updated column fields with new metadata
+    fields = []
+    for col in table.schema.names:
+        if col in column_metadata:
+            # Add/update column metadata
+            metadata = table.field(col).metadata or {}
+            for key, value in column_metadata[col].items():
+                metadata[key] = value
+            # Update field with updated metadata
+            fields.append(table.field(col).with_metadata(metadata))
+        else:
+            fields.append(table.field(col))
+
+    # Create new schema with the updated field metadata
+    schema = pa.schema(fields, metadata=table.schema.metadata)
+
+    # Build new table with updated schema (shouldn't copy data)
+    table = table.cast(schema)
+
+    return table
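A quick usage sketch of the helper, assuming `_add_column_metadata` is in scope; the table contents are illustrative:

```python
import pyarrow as pa

table = pa.table({"observed_at": ["2023-01-15T10:00:00"], "value": [1.0]})
table = _add_column_metadata(
    table, column_metadata={"observed_at": {"GDAL:OGR:type": "DateTime"}}
)
print(table.field("observed_at").metadata)
# {b'GDAL:OGR:type': b'DateTime'}
```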