ENH: improve support for datetime columns #486
@@ -2,6 +2,7 @@
 import os
 import warnings
+from datetime import datetime
 
 import numpy as np
@@ -12,6 +13,7 @@ | |
PANDAS_GE_22, | ||
PANDAS_GE_30, | ||
PYARROW_GE_19, | ||
__gdal_version__, | ||
) | ||
from pyogrio.errors import DataSourceError | ||
from pyogrio.raw import ( | ||
|
@@ -37,33 +39,56 @@ def _stringify_path(path):
     return path
 
 
-def _try_parse_datetime(ser):
+def _try_parse_datetime(ser, datetimes="UTC"):
     import pandas as pd  # only called when pandas is known to be installed
+    from pandas.api.types import is_string_dtype
+
+    datetimes = datetimes.upper()
+    datetime_kwargs = {}
+    if datetimes == "STRING":
+        if not is_string_dtype(ser.dtype):
+            res = ser.astype("string").str.replace(" ", "T")
+            return res
+        if __gdal_version__ < (3, 7, 0):
+            # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that
+            return ser.str.replace(" ", "T").str.replace("/", "-")
+        return ser
+    elif datetimes == "UTC":
+        datetime_kwargs["utc"] = True
+    elif datetimes == "DATETIME":
+        datetime_kwargs["utc"] = False
+    else:
+        raise ValueError(
+            f"Invalid value for 'datetimes': {datetimes!r}. "
+            "Must be 'UTC', 'DATETIME' or 'STRING'."
+        )
 
     if PANDAS_GE_22:
-        datetime_kwargs = {"format": "ISO8601"}
+        datetime_kwargs["format"] = "ISO8601"
     elif PANDAS_GE_20:
-        datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
+        datetime_kwargs["format"] = "ISO8601"
+        datetime_kwargs["errors"] = "ignore"
     else:
-        datetime_kwargs = {"yearfirst": True}
+        datetime_kwargs["yearfirst"] = True
+
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
             ".*parsing datetimes with mixed time zones will raise.*",
             FutureWarning,
         )
         # pre-emptive try catch for when pandas will raise
         # (can tighten the exception type in future when it does)
+        res = ser
         try:
             res = pd.to_datetime(ser, **datetime_kwargs)
-        except Exception:
-            res = ser
-    # if object dtype, try parse as utc instead
-    if res.dtype in ("object", "string"):
-        try:
-            res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
-        except Exception:
-            pass
+        except ValueError as ex:
+            if "Mixed timezones detected" in str(ex):
+                # Parsing mixed timezones with to_datetime is not supported
+                # anymore in pandas>=3.0, so convert to pd.Timestamp objects
+                # manually. Using map twice seems to be the fastest way to do
+                # this.
+                res = ser.map(datetime.fromisoformat, na_action="ignore").map(
+                    pd.Timestamp, na_action="ignore"
+                )
 
     if res.dtype.kind == "M":  # any datetime64
         # GDAL only supports ms precision, convert outputs to match.
@@ -73,6 +98,7 @@ def _try_parse_datetime(ser):
             res = res.dt.as_unit("ms")
         else:
             res = res.dt.round(freq="ms")
+
     return res
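For illustration, a minimal sketch (not part of the diff) of the two parsing paths handled above; the ISO8601 strings are hypothetical values of the kind GDAL returns:

```python
from datetime import datetime

import pandas as pd

# Hypothetical column with mixed UTC offsets, read as ISO8601 strings.
ser = pd.Series(["2023-01-15T10:00:00+01:00", "2023-07-15T10:00:00+02:00", None])

# datetimes="UTC": utc=True always succeeds; everything is normalized to UTC.
print(pd.to_datetime(ser, utc=True))
# 0   2023-01-15 09:00:00+00:00
# 1   2023-07-15 08:00:00+00:00
# 2                         NaT
# dtype: datetime64[ns, UTC]

# datetimes="DATETIME": with utc=False, mixed offsets make pd.to_datetime raise
# (pandas >= 3.0), so the fallback builds pd.Timestamp objects one by one,
# keeping each value's own offset in an object column.
res = ser.map(datetime.fromisoformat, na_action="ignore").map(
    pd.Timestamp, na_action="ignore"
)
print(res.dtype)  # object
```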
@@ -96,6 +122,7 @@ def read_dataframe(
     use_arrow=None,
     on_invalid="raise",
     arrow_to_pandas_kwargs=None,
+    datetimes="UTC",
     **kwargs,
 ):
     """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
@@ -223,8 +250,22 @@ def read_dataframe(
     arrow_to_pandas_kwargs : dict, optional (default: None)
         When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_
         call for the arrow to pandas conversion.
+    datetimes : str, optional (default: "UTC")
+        The way datetime columns should be returned. Possible values:
+
+        - **"UTC"**: return all datetime columns as pandas datetime64 columns
+          converted to UTC. Naive datetimes (without timezone information) are
+          assumed to be in UTC.
+        - **"DATETIME"**: return datetimes in the timezone they have in the
+          data source. Columns with values in a single timezone or without
+          timezone information are returned as pandas datetime64 columns.
+          Columns with mixed timezone data are returned as object columns
+          with pandas.Timestamp values. If you want to roundtrip datetimes
+          as faithfully as possible, use this option.
+        - **"STRING"**: return all datetimes as ISO8601 strings.
+
     **kwargs
         Additional driver-specific dataset open options passed to OGR. Invalid
         options will trigger a warning.
 
     Returns
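A usage sketch of the new parameter; the file name `example.gpkg` is hypothetical:

```python
import pyogrio

# Default: tz-aware datetime64 columns, normalized to UTC.
df = pyogrio.read_dataframe("example.gpkg", datetimes="UTC")

# Keep original timezones; mixed-offset columns become object columns of
# pandas.Timestamp values.
df = pyogrio.read_dataframe("example.gpkg", datetimes="DATETIME")

# No parsing at all: ISO8601 strings such as "2023-01-15T10:00:00+01:00".
df = pyogrio.read_dataframe("example.gpkg", datetimes="STRING")
```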
@@ -267,11 +308,10 @@ def read_dataframe(
     read_func = read_arrow if use_arrow else read
     gdal_force_2d = False if use_arrow else force_2d
-    if not use_arrow:
-        # For arrow, datetimes are read as is.
-        # For numpy IO, datetimes are read as string values to preserve timezone info
-        # as numpy does not directly support timezones.
-        kwargs["datetime_as_string"] = True
+    # Always read datetimes as string values to preserve (mixed) timezone info,
+    # as numpy does not directly support timezones and arrow datetime columns
+    # don't support mixed timezones.
 
     result = read_func(
         path_or_buffer,
         layer=layer,
@@ -288,6 +328,7 @@ def read_dataframe(
         sql=sql,
         sql_dialect=sql_dialect,
         return_fids=fid_as_index,
+        datetime_as_string=True,
         **kwargs,
     )
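For context, a sketch of what this passes to the lower-level reader; `example.gpkg` is hypothetical, and the 4-tuple unpacking assumes the documented return of `pyogrio.raw.read`:

```python
from pyogrio.raw import read

# With datetime_as_string=True, datetime fields come back as arrays of
# ISO8601 strings rather than datetime64 values, so offsets like "+01:00"
# survive numpy's lack of timezone support.
meta, index, geometry, field_data = read("example.gpkg", datetime_as_string=True)
```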
@@ -330,6 +371,11 @@ def read_dataframe(
         del table
 
+        # convert datetime columns that were read as string to datetime
+        for dtype, column in zip(meta["dtypes"], meta["fields"]):
+            if dtype is not None and dtype.startswith("datetime"):
+                df[column] = _try_parse_datetime(df[column], datetimes=datetimes)
+
         if fid_as_index:
             df = df.set_index(meta["fid_column"])
             df.index.names = ["fid"]
@@ -361,7 +407,7 @@ def read_dataframe(
         df = pd.DataFrame(data, columns=columns, index=index)
         for dtype, c in zip(meta["dtypes"], df.columns):
             if dtype.startswith("datetime"):
-                df[c] = _try_parse_datetime(df[c])
+                df[c] = _try_parse_datetime(df[c], datetimes=datetimes)
 
         if geometry is None or not read_geometry:
             return df
@@ -584,6 +630,7 @@ def write_dataframe(
         crs = geometry.crs.to_wkt("WKT1_GDAL")
 
     if use_arrow:
+        import pandas as pd  # only called when pandas is known to be installed
         import pyarrow as pa
 
         from pyogrio.raw import write_arrow
@@ -619,8 +666,33 @@ def write_dataframe(
         df = pd.DataFrame(df, copy=False)
         df[geometry_column] = geometry
 
+        # Convert all datetime columns to isoformat strings, to avoid mixed
+        # timezone information getting lost.
+        datetime_cols = []
+        for name, dtype in df.dtypes.items():
+            col = df[name]
+            if dtype == "object":
+                # If the first non-NA value is a datetime-like object, treat it
+                # as a datetime column.
+                first_non_na_index = col.first_valid_index()
+                if first_non_na_index is not None:
+                    if isinstance(col[first_non_na_index], (pd.Timestamp, datetime)):
+                        df[name] = col.astype("string")
+                        datetime_cols.append(name)
+            elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
+                # For datetime columns with a timezone other than UTC, convert
+                # to string as well, otherwise the timezone info is lost.
+                df[name] = col.astype("string")
+                datetime_cols.append(name)
+

Review comment: Do you know why GDAL preserves the tz-awareness for UTC, but not for other offsets (even though the values written to the file are in UTC) in the Arrow write path?

Reply (theroggy): I didn't test it again explicitly, but this follows the general logic that most timezones use daylight saving time, leading to (potentially) different offsets within a single column, and with different offsets in one column the timezone information gets lost. I updated the inline comment to clarify this.

         table = pa.Table.from_pandas(df, preserve_index=False)
 
+        # Add metadata to datetime columns so GDAL knows they are datetimes.
+        for datetime_col in datetime_cols:
+            table = _add_column_metadata(
+                table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}}
+            )
+
         # Null arrow columns are not supported by GDAL, so convert to string
         for field_index, field in enumerate(table.schema):
             if field.type == pa.null():
@@ -678,6 +750,8 @@ def write_dataframe(
     gdal_tz_offsets = {}
     for name in fields:
         col = df[name]
+        values = None
+
         if isinstance(col.dtype, pd.DatetimeTZDtype):
             # Deal with datetimes with timezones by passing down timezone separately
             # pass down naive datetime
@@ -692,8 +766,24 @@ def write_dataframe(
             # Convert each row offset to a signed multiple of 15m and add to GMT value
             gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
             gdal_tz_offsets[name] = gdal_offset_representation.values
-        else:
+
+        elif col.dtype == "object":
+            # Column of Timestamp/datetime objects: split into naive datetime and tz.
+            col_na = df[col.notna()][name]
+            if len(col_na) and all(
+                isinstance(x, (pd.Timestamp, datetime)) for x in col_na
+            ):
+                tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset())
+                gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
+                gdal_tz_offsets[name] = gdal_offset_repr.values
+                naive = col.apply(
+                    lambda x: None if pd.isna(x) else x.replace(tzinfo=None)
+                )
+                values = naive.values
+
+        if values is None:
             values = col.values
+
         if isinstance(values, pd.api.extensions.ExtensionArray):
             from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
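A worked example of the encoding above, mirroring the arithmetic in the diff: the offset is expressed as a signed count of 15-minute blocks and biased by 100 (GMT):

```python
import pandas as pd

# +02:00 -> 120 min ->  8 blocks of 15 min -> 100 + 8  = 108
# -05:00 -> -300 min -> -20 blocks of 15 min -> 100 - 20 = 80
for offset in [pd.Timedelta("2h"), pd.Timedelta("-5h")]:
    print(offset, "->", offset // pd.Timedelta("15min") + 100)
```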
@@ -729,3 +819,48 @@ def write_dataframe(
         gdal_tz_offsets=gdal_tz_offsets,
         **kwargs,
     )
+
+
+def _add_column_metadata(table, column_metadata: dict = {}):
+    """Add or update column-level metadata to an arrow table.
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+        The table to add the column metadata to.
+    column_metadata : dict
+        A dictionary with column metadata in the form
+        {
+            "column_1": {"some": "data"},
+            "column_2": {"more": "stuff"},
+        }
+
+    Returns
+    -------
+    pyarrow.Table: table with the updated column metadata.
+    """
+    import pyarrow as pa
+
+    if not column_metadata:
+        return table
+
+    # Create updated column fields with new metadata
+    fields = []
+    for col in table.schema.names:
+        if col in column_metadata:
+            # Add/update column metadata
+            metadata = table.field(col).metadata or {}
+            for key, value in column_metadata[col].items():
+                metadata[key] = value
+            # Update field with updated metadata
+            fields.append(table.field(col).with_metadata(metadata))
+        else:
+            fields.append(table.field(col))
+
+    # Create new schema with the updated field metadata
+    schema = pa.schema(fields, metadata=table.schema.metadata)
+
+    # Build new table with updated schema (shouldn't copy data)
+    table = table.cast(schema)
+
+    return table
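A quick usage sketch of the helper, assuming `_add_column_metadata` is in scope; the table contents are illustrative:

```python
import pyarrow as pa

table = pa.table({"observed_at": ["2023-01-15T10:00:00"], "value": [1.0]})
table = _add_column_metadata(
    table, column_metadata={"observed_at": {"GDAL:OGR:type": "DateTime"}}
)
print(table.field("observed_at").metadata)
# {b'GDAL:OGR:type': b'DateTime'}
```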