Commits (70)
aaf8818
ENH: deal properly with naive datetimes with arrow
theroggy Oct 17, 2024
3e463a1
Add more testcases, also for tz datetimes
theroggy Oct 18, 2024
afdd0c1
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 16, 2025
c18ab22
Use datetime_as_string for reading with arrow
theroggy Jan 17, 2025
597855f
Update _io.pyx
theroggy Jan 17, 2025
fa4b86e
Skip tests where appropriate
theroggy Jan 17, 2025
0e41ae4
Improve support for mixed and naive datetimes
theroggy Jan 17, 2025
1378ace
Skip use_arrow tests with old gdal versions
theroggy Jan 17, 2025
0f1ab27
Take into account pandas version
theroggy Jan 17, 2025
6f78c68
Update test_geopandas_io.py
theroggy Jan 17, 2025
336d0d8
Also support columns with datetime objects
theroggy Jan 18, 2025
3035a11
Rename some test functions for consistency
theroggy Jan 18, 2025
9efdc09
Avoid warning in test
theroggy Jan 18, 2025
eb80e08
Improve inline comment
theroggy Jan 18, 2025
d50b2d0
Update CHANGES.md
theroggy Jan 18, 2025
47aa298
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 19, 2025
1efa5bf
Simplify code
theroggy Jan 20, 2025
0032839
Don't cast UTC data to string when writing
theroggy Jan 20, 2025
9d2bfce
Various improvements to tests
theroggy Jan 20, 2025
ca9a8ae
Small fixes to tests
theroggy Jan 20, 2025
deb862c
Xfail some tests where needed
theroggy Jan 20, 2025
e35c356
Make UTC assert more specific
theroggy Jan 22, 2025
593b282
Revert "Make UTC assert more specific"
theroggy Jan 22, 2025
35d8d87
Update test_geopandas_io.py
theroggy Jan 22, 2025
41c9da6
Use astype("string") instead of apply
theroggy Jan 23, 2025
f53af87
Improve tests
theroggy Jan 23, 2025
a8c85b7
Fix tests for older versions
theroggy Jan 23, 2025
40ca1a5
Update test_geopandas_io.py
theroggy Jan 23, 2025
fc53d44
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Jan 30, 2025
458d75b
Merge
theroggy Apr 23, 2025
8a38961
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy May 13, 2025
9c764eb
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Aug 2, 2025
e0273b5
Add parameter to specify if dates need to be read as UTC or not
theroggy Aug 2, 2025
91027d1
Fix tests for old gdal versions
theroggy Aug 3, 2025
46f7847
Treat Object column as datetime if the first non-null value is a date…
theroggy Aug 3, 2025
8f7d853
Add support to return datetimes as string
theroggy Aug 3, 2025
9398ae4
Convert to string for older GDAL versions
theroggy Aug 3, 2025
3d3e3da
Fixes to tests for old gdal versions
theroggy Aug 3, 2025
097d85a
Fix tests for older GDAL versions
theroggy Aug 3, 2025
41be45c
Fixes in tests for old GDAL versions
theroggy Aug 3, 2025
4f27049
Fix tests for older gdal versions
theroggy Aug 3, 2025
cf9eba9
Fix tests
theroggy Aug 3, 2025
86239e2
Fix tests for gdal < 3.7
theroggy Aug 3, 2025
7d28e3d
Test fixes for gdal <3.7
theroggy Aug 3, 2025
201c01e
Fix linter error
theroggy Aug 3, 2025
4923178
Several textual improvements
theroggy Aug 3, 2025
f0e7409
Add test for dates long ago
theroggy Aug 3, 2025
8117e96
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Aug 3, 2025
538b95f
Improve docstring of test with dates from long ago
theroggy Aug 3, 2025
f23bd38
Fix long ago test for old versions of gdal
theroggy Aug 3, 2025
b55cc2f
xfail dates of long ago with arrow and gdal <3.11
theroggy Aug 3, 2025
616a144
Fix some errors with pandas 3
theroggy Aug 3, 2025
93dbc6e
Improve error
theroggy Aug 4, 2025
6af6d63
Support for pandas 3.0
theroggy Aug 4, 2025
066ec42
Update geopandas.py
theroggy Aug 4, 2025
5c9efa1
Support pandas 3.0
theroggy Aug 4, 2025
b421a06
Support pandas 3.0
theroggy Aug 4, 2025
4486c9e
Update test_geopandas_io.py
theroggy Aug 5, 2025
a3a0393
Update test_geopandas_io.py
theroggy Aug 5, 2025
69dcc6a
Merge branch 'ENH-deal-properly-with-naive-datetimes-with-arrow' of h…
theroggy Aug 5, 2025
7248d1b
Small textual improvements
theroggy Aug 5, 2025
86529a2
Typo in changelog
theroggy Aug 5, 2025
8303c05
Fix UTC mode so it is backwards compatible
theroggy Aug 7, 2025
ac5f20f
Fix for pandas 3.0
theroggy Aug 7, 2025
23bf348
Improve inline doc.
theroggy Aug 7, 2025
2125547
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Aug 27, 2025
aab9240
Rename datetimes options
theroggy Aug 27, 2025
d3a2ae8
Fix for pandas <2
theroggy Aug 27, 2025
02036bb
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Aug 31, 2025
bc1766e
Merge remote-tracking branch 'upstream/main' into ENH-deal-properly-w…
theroggy Aug 31, 2025
7 changes: 7 additions & 0 deletions CHANGES.md
@@ -1,5 +1,12 @@
# CHANGELOG

## 0.12.0 (????-??-??)

### Improvements

- Add `datetimes` parameter to `read_dataframe` to choose how datetime columns are
  returned, plus several fixes when reading and writing datetimes (#486).

## 0.11.1 (2025-08-02)

### Bug fixes
22 changes: 18 additions & 4 deletions pyogrio/_io.pyx
@@ -936,10 +936,16 @@ cdef process_fields(

if datetime_as_string:
# defer datetime parsing to user/ pandas layer
# Update to OGR_F_GetFieldAsISO8601DateTime when GDAL 3.7+ only
data[i] = get_string(
OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding
)
IF CTE_GDAL_VERSION >= (3, 7, 0):
data[i] = get_string(
OGR_F_GetFieldAsISO8601DateTime(ogr_feature, field_index, NULL),
encoding=encoding,
)
ELSE:
data[i] = get_string(
OGR_F_GetFieldAsString(ogr_feature, field_index),
encoding=encoding,
)
else:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature,
@@ -1502,6 +1508,7 @@ def ogr_open_arrow(
int return_fids=False,
int batch_size=0,
use_pyarrow=False,
datetime_as_string=False,
):

cdef int err = 0
@@ -1722,6 +1729,12 @@
"GEOARROW".encode("UTF-8")
)

# Read DateTime fields as strings, as the Arrow DateTime column type has
# quite limited support for mixed timezones and similar cases.
IF CTE_GDAL_VERSION >= (3, 11, 0):
if datetime_as_string:
options = CSLSetNameValue(options, "DATETIME_AS_STRING", "YES")

# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)

Expand Down Expand Up @@ -1749,6 +1762,7 @@ def ogr_open_arrow(
"crs": crs,
"encoding": encoding,
"fields": fields[:, 2],
"dtypes": fields[:, 3],
"geometry_type": geometry_type,
"geometry_name": geometry_name,
"fid_column": fid_column,
8 changes: 8 additions & 0 deletions pyogrio/_ogr.pxd
@@ -415,6 +415,14 @@ IF CTE_GDAL_VERSION >= (3, 6, 0):
)


IF CTE_GDAL_VERSION >= (3, 7, 0):

cdef extern from "ogr_api.h":
const char* OGR_F_GetFieldAsISO8601DateTime(
OGRFeatureH feature, int n, char** papszOptions
)


IF CTE_GDAL_VERSION >= (3, 8, 0):

cdef extern from "ogr_api.h":
179 changes: 157 additions & 22 deletions pyogrio/geopandas.py
@@ -2,6 +2,7 @@

import os
import warnings
from datetime import datetime

import numpy as np

@@ -12,6 +13,7 @@
PANDAS_GE_22,
PANDAS_GE_30,
PYARROW_GE_19,
__gdal_version__,
)
from pyogrio.errors import DataSourceError
from pyogrio.raw import (
@@ -37,33 +39,56 @@ def _stringify_path(path):
return path


def _try_parse_datetime(ser):
def _try_parse_datetime(ser, datetimes="UTC"):
import pandas as pd # only called when pandas is known to be installed
from pandas.api.types import is_string_dtype

datetimes = datetimes.upper()
datetime_kwargs = {}
if datetimes == "STRING":
if not is_string_dtype(ser.dtype):
res = ser.astype("string").str.replace(" ", "T")
return res
if __gdal_version__ < (3, 7, 0):
# GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that
return ser.str.replace(" ", "T").str.replace("/", "-")
return ser
elif datetimes == "UTC":
datetime_kwargs["utc"] = True
elif datetimes == "DATETIME":
datetime_kwargs["utc"] = False
else:
raise ValueError(
f"Invalid value for 'datetimes': {datetimes!r}. "
"Must be 'UTC' or 'DATETIME'."
)

if PANDAS_GE_22:
datetime_kwargs = {"format": "ISO8601"}
datetime_kwargs["format"] = "ISO8601"
elif PANDAS_GE_20:
datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
datetime_kwargs["format"] = "ISO8601"
datetime_kwargs["errors"] = "ignore"
else:
datetime_kwargs = {"yearfirst": True}
datetime_kwargs["yearfirst"] = True

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
".*parsing datetimes with mixed time zones will raise.*",
FutureWarning,
)
# pre-emptive try catch for when pandas will raise
# (can tighten the exception type in future when it does)

res = ser
try:
res = pd.to_datetime(ser, **datetime_kwargs)
except Exception:
res = ser
# if object dtype, try parse as utc instead
if res.dtype in ("object", "string"):
try:
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
except Exception:
pass
except ValueError as ex:
if "Mixed timezones detected" in str(ex):
# Parsing mixed timezones with to_datetime is not supported anymore in
# pandas>=3.0, so convert to pd.Timestamp objects manually.
# Using map twice seems to be the fastest way to do this.
res = ser.map(datetime.fromisoformat, na_action="ignore").map(
pd.Timestamp, na_action="ignore"
)

if res.dtype.kind == "M": # any datetime64
# GDAL only supports ms precision, convert outputs to match.
Expand All @@ -73,6 +98,7 @@ def _try_parse_datetime(ser):
res = res.dt.as_unit("ms")
else:
res = res.dt.round(freq="ms")

return res
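A quick sketch of the mixed-offset fallback above (assumes pandas >= 3.0, where to_datetime raises on mixed offsets unless utc=True):

```python
import pandas as pd
from datetime import datetime

ser = pd.Series(["2025-01-01T12:00:00+01:00", "2025-06-01T12:00:00+02:00"])
try:
    res = pd.to_datetime(ser, format="ISO8601")  # raises ValueError for mixed offsets
except ValueError:
    # fall back to an object column of pd.Timestamp values, as above
    res = ser.map(datetime.fromisoformat, na_action="ignore").map(
        pd.Timestamp, na_action="ignore"
    )
print(res.dtype)  # object
```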


@@ -96,6 +122,7 @@ def read_dataframe(
use_arrow=None,
on_invalid="raise",
arrow_to_pandas_kwargs=None,
datetimes="UTC",
**kwargs,
):
"""Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
@@ -223,8 +250,22 @@
arrow_to_pandas_kwargs : dict, optional (default: None)
When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_
call for the arrow to pandas conversion.
datetimes : str, optional (default: "UTC")
The way datetime columns should be returned. Possible values:

- **"UTC"**: return all datetime columns as pandas datetime64 columns
converted to UTC. Naive datetimes (without timezone information) will
be assumed to be in UTC.
- **"DATETIME"**: return datetimes in the timezone as they were read
from the data source. Columns with values in a single timezone or
without timezone information will be returned as pandas datetime64
columns. Columns with timezone-aware values with mixed offsets are returned as object
Review comment (Member):
Small nitpick on the wording, but I think we should be clear that these are "timezone-aware columns with mixed offsets", because the general case of mixed offsets (due to DST changes throughout the year) is still a single time zone.
The issue is that GDAL does not actually support the concept of time zones, only offsets.

Reply (Member Author):
True, I tried to clarify this.

columns with pandas.Timestamp values. If you want to roundtrip
datetimes as faithfully as possible, use this option.
- **"STRING"**: return all datetimes as ISO8601 strings.

**kwargs
Additional driver-specific dataset open options passed to OGR. Invalid
options will trigger a warning.

Returns
@@ -267,11 +308,10 @@

read_func = read_arrow if use_arrow else read
gdal_force_2d = False if use_arrow else force_2d
if not use_arrow:
# For arrow, datetimes are read as is.
# For numpy IO, datetimes are read as string values to preserve timezone info
# as numpy does not directly support timezones.
kwargs["datetime_as_string"] = True

# Always read datetimes as string values to preserve (mixed) timezone info
# as numpy does not directly support timezones and arrow datetime columns
# don't support mixed timezones.
Review comment (Member):
Do you know what the performance impact is of also setting datetime_as_string to True for the Arrow code path?
I would think that this is going to be quite a bit slower, and in that case we should maybe only set it if the user asked for getting datetime objects or strings, and not in the default case?

Reply (Member Author):
In several situations this leads to wrong data being returned, e.g. timezones being dropped, especially for .fgb files and sometimes for GPKG files.

I did a quick test with the 3.3 million buildings in New Zealand on my laptop, and the times fluctuate some, but during a more stable period reading them with datetime_as_string=True took 10.6 seconds and with datetime_as_string=False it took around 11.4 seconds, so reading as strings was actually a bit faster, but not a huge difference.
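A rough timing sketch along the lines of that quick test ("buildings.fgb" is a placeholder path; datetime_as_string on the raw API is assumed to behave as in this PR):

```python
import time

from pyogrio.raw import read_arrow

for as_string in (False, True):
    t0 = time.perf_counter()
    read_arrow("buildings.fgb", datetime_as_string=as_string)  # hypothetical large file
    print(f"datetime_as_string={as_string}: {time.perf_counter() - t0:.1f} s")
```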

result = read_func(
path_or_buffer,
layer=layer,
Expand All @@ -288,6 +328,7 @@ def read_dataframe(
sql=sql,
sql_dialect=sql_dialect,
return_fids=fid_as_index,
datetime_as_string=True,
**kwargs,
)

@@ -330,6 +371,11 @@

del table

# convert datetime columns that were read as string to datetime
for dtype, column in zip(meta["dtypes"], meta["fields"]):
if dtype is not None and dtype.startswith("datetime"):
df[column] = _try_parse_datetime(df[column], datetimes=datetimes)

if fid_as_index:
df = df.set_index(meta["fid_column"])
df.index.names = ["fid"]
@@ -361,7 +407,7 @@
df = pd.DataFrame(data, columns=columns, index=index)
for dtype, c in zip(meta["dtypes"], df.columns):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c])
df[c] = _try_parse_datetime(df[c], datetimes=datetimes)

if geometry is None or not read_geometry:
return df
@@ -584,6 +630,7 @@ def write_dataframe(
crs = geometry.crs.to_wkt("WKT1_GDAL")

if use_arrow:
import pandas as pd # only called when pandas is known to be installed
import pyarrow as pa

from pyogrio.raw import write_arrow
@@ -619,8 +666,33 @@
df = pd.DataFrame(df, copy=False)
df[geometry_column] = geometry

# Convert all datetime columns to isoformat strings, to avoid mixed timezone
# information getting lost.
datetime_cols = []
for name, dtype in df.dtypes.items():
col = df[name]
if dtype == "object":
# If first non-NA value is a datetime-like object, treat as datetime
# column.
first_non_na_index = col.first_valid_index()
if first_non_na_index is not None:
if isinstance(col[first_non_na_index], (pd.Timestamp, datetime)):
df[name] = col.astype("string")
datetime_cols.append(name)
elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
# When it is a datetime column with a timezone different than UTC, it
# needs to be converted to string, otherwise the timezone info is lost.
df[name] = col.astype("string")
datetime_cols.append(name)
Review comment (Member):
Do you know why GDAL preserves the tz-awareness for UTC, but not for other offsets (even though the values written to the file are in UTC) in the Arrow write path?

Reply (Member Author):
I didn't test it again explicitly, but this follows the general logic that most timezones use daylight saving time, leading to potentially different offsets in a column, and with different offsets in one column the timezone information gets lost. I updated the inline comment to clarify this.
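A small illustration of that DST effect (hypothetical values: a single time zone yielding two different offsets, which is why such columns are cast to string before writing):

```python
import pandas as pd

ser = pd.Series(pd.to_datetime(["2025-01-15 12:00", "2025-07-15 12:00"]))
ser = ser.dt.tz_localize("Europe/Brussels")  # one time zone, two offsets (CET/CEST)
print(ser.astype("string"))
# 0    2025-01-15 12:00:00+01:00
# 1    2025-07-15 12:00:00+02:00
```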


table = pa.Table.from_pandas(df, preserve_index=False)

# Add metadata to datetime columns so GDAL knows they are datetimes.
for datetime_col in datetime_cols:
table = _add_column_metadata(
table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}}
)

# Null arrow columns are not supported by GDAL, so convert to string
for field_index, field in enumerate(table.schema):
if field.type == pa.null():
@@ -678,6 +750,8 @@
gdal_tz_offsets = {}
for name in fields:
col = df[name]
values = None

if isinstance(col.dtype, pd.DatetimeTZDtype):
# Deal with datetimes with timezones by passing down timezone separately
# pass down naive datetime
@@ -692,8 +766,24 @@
# Convert each row offset to a signed multiple of 15m and add to GMT value
gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
gdal_tz_offsets[name] = gdal_offset_representation.values
else:

elif col.dtype == "object":
# Column of Timestamp/datetime objects: split into naive datetimes and tz offsets.
col_na = df[col.notna()][name]
if len(col_na) and all(
isinstance(x, (pd.Timestamp, datetime)) for x in col_na
):
tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset())
gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
gdal_tz_offsets[name] = gdal_offset_repr.values
naive = col.apply(
lambda x: None if pd.isna(x) else x.replace(tzinfo=None)
)
values = naive.values

if values is None:
values = col.values

if isinstance(values, pd.api.extensions.ExtensionArray):
from pandas.arrays import BooleanArray, FloatingArray, IntegerArray

@@ -729,3 +819,48 @@
gdal_tz_offsets=gdal_tz_offsets,
**kwargs,
)
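A worked example of the 15-minute offset encoding used above, where 100 represents GMT in GDAL's timezone flag:

```python
import pandas as pd

# one unit == 15 minutes, 100 == GMT
print(pd.Timedelta("2h") // pd.Timedelta("15m") + 100)   # 108 -> UTC+02:00
print(pd.Timedelta("-5h") // pd.Timedelta("15m") + 100)  # 80  -> UTC-05:00
print(pd.Timedelta("0h") // pd.Timedelta("15m") + 100)   # 100 -> UTC
```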


def _add_column_metadata(table, column_metadata: dict = {}):
"""Add or update column-level metadata to an arrow table.

Parameters
----------
table : pyarrow.Table
The table to add the column metadata to.
column_metadata : dict
A dictionary with column metadata in the form
{
"column_1": {"some": "data"},
"column_2": {"more": "stuff"},
}

Returns
-------
pyarrow.Table: table with the updated column metadata.
"""
import pyarrow as pa

if not column_metadata:
return table

# Create updated column fields with new metadata
fields = []
for col in table.schema.names:
if col in column_metadata:
# Add/update column metadata
metadata = table.field(col).metadata or {}
for key, value in column_metadata[col].items():
metadata[key] = value
# Update field with updated metadata
fields.append(table.field(col).with_metadata(metadata))
else:
fields.append(table.field(col))

# Create new schema with the updated field metadata
schema = pa.schema(fields, metadata=table.schema.metadata)

# Build new table with updated schema (shouldn't copy data)
table = table.cast(schema)

return table
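A usage sketch for this helper (hypothetical table; the GDAL:OGR:type key mirrors the one set in write_dataframe above):

```python
import pyarrow as pa

table = pa.table({"ts": ["2025-01-15T12:00:00+01:00"]})
table = _add_column_metadata(
    table, column_metadata={"ts": {"GDAL:OGR:type": "DateTime"}}
)
print(table.field("ts").metadata)
# {b'GDAL:OGR:type': b'DateTime'}
```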