Skip to content

Commit e8ce31b

Browse files
authored
Add export() function that wraps writing to all supported formats (based on file extension) (#214)
Plus fixes for various bugs uncovered by tests: - Strip timezone info from all datetime values before saving to xlsx (for compatibility with openpyxl) - Fix issues with some observation formats passed to `to_json()` - Update `to_geojson()` to use location tuple if available
2 parents 36c2f11 + f304797 commit e8ce31b

File tree

6 files changed

+157
-46
lines changed

6 files changed

+157
-46
lines changed

HISTORY.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@
44

55
- Add database migrations with Alembic and include in PyPI package
66
- Add a `migrate()` helper for downstream libraries
7+
- Add a unified `export()` function to write observations to any supported format (based on file extension)
78
- Extend `read()` to support GeoJSON, GPX, DwC, and SQLite formats
89
- Improve CSV loading performance
910
- Rewrite taxonomy aggregation for significantly improved performance
1011
- Optimize DwC-A table loading (delay index creation until after all rows are inserted)
1112
- Raise a more descriptive error when SQLAlchemy is missing
13+
- Strip timezone info from all datetime values before saving to xlsx (for compatibility with openpyxl)
14+
- Fix issues with some observation formats passed to `to_json()`
15+
- Update `to_geojson()` to use `location` tuple if available
1216

1317
## 0.7.0 (2026-01-22)
1418

pyinaturalist_convert/converters.py

Lines changed: 108 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424
>>> to_json(observations, 'my_observations.json')
2525
>>> to_parquet(observations, 'my_observations.parquet')
2626
27+
Or export to any supported format by file extension:
28+
29+
>>> export(observations, 'my_observations.csv')
30+
>>> export(observations, 'my_observations.geojson')
31+
>>> export(observations, 'my_observations.gpx')
32+
2733
Load back into Observation objects:
2834
2935
>>> observations = read('my_observations.csv')
@@ -33,11 +39,13 @@
3339
>>> observations = read('my_observations.json')
3440
>>> observations = read('my_observations.parquet')
3541
42+
3643
**Export functions:**
3744
3845
.. autosummary::
3946
:nosignatures:
4047
48+
export
4149
to_csv
4250
to_excel
4351
to_feather
@@ -61,9 +69,10 @@
6169
import json
6270
from collections.abc import Iterable, Sequence
6371
from copy import deepcopy
72+
from datetime import datetime
6473
from logging import getLogger
6574
from pathlib import Path
66-
from typing import TYPE_CHECKING, Optional, TypeAlias
75+
from typing import TYPE_CHECKING, TypeAlias
6776

6877
from flatten_dict import flatten, unflatten
6978
from pyinaturalist import BaseModel, JsonResponse, ModelObjects, Observation, ResponseResult, Taxon
@@ -147,7 +156,7 @@ def to_dicts(value: InputTypes) -> Iterable[dict]:
147156
return [value] # type: ignore [list-item]
148157

149158

150-
def to_csv(observations: AnyObservations, filename: Optional[str] = None):
159+
def to_csv(observations: AnyObservations, filename: PathOrStr):
151160
"""Convert observations to CSV"""
152161
from pandas import DataFrame
153162

@@ -178,30 +187,31 @@ def to_dataset(observations: AnyObservations) -> Dataset:
178187
return dataset
179188

180189

181-
def to_excel(observations: AnyObservations, filename: str):
190+
def to_excel(observations: AnyObservations, filename: PathOrStr):
182191
"""Convert observations to an Excel spreadsheet (xlsx)"""
183-
xlsx_observations = to_dataset(observations).get_xlsx()
184-
write(xlsx_observations, filename, 'wb')
192+
dataset = to_dataset(observations)
193+
dataset = _strip_tzinfo(dataset)
194+
write(dataset.get_xlsx(), filename, 'wb')
185195

186196

187-
def to_feather(observations: AnyObservations, filename: str):
197+
def to_feather(observations: AnyObservations, filename: PathOrStr):
188198
"""Convert observations into a Feather file"""
189199
df = to_dataframe(observations)
190200
df.to_feather(filename)
191201

192202

193-
def to_hdf(observations: AnyObservations, filename: str):
203+
def to_hdf(observations: AnyObservations, filename: PathOrStr):
194204
"""Convert observations into a HDF5 file"""
195205
df = to_dataframe(observations)
196206
df.to_hdf(filename, 'observations')
197207

198208

199-
def to_json(observations: AnyObservations, filename: str):
209+
def to_json(observations: AnyObservations, filename: PathOrStr):
200210
"""Convert observations into a JSON file"""
201-
write(json.dumps(observations, indent=2, default=str), filename)
211+
write(json.dumps(to_dicts(observations), indent=2, default=str), filename)
202212

203213

204-
def to_parquet(observations: AnyObservations, filename: str):
214+
def to_parquet(observations: AnyObservations, filename: PathOrStr):
205215
"""Convert observations into a Parquet file"""
206216
df = to_dataframe(observations)
207217
df.to_parquet(filename)
@@ -225,43 +235,89 @@ def read(filename: PathOrStr) -> list[Observation]:
225235
import pandas as pd
226236

227237
from .csv import is_csv_export, load_csv_exports
238+
from .db import get_db_observations
228239
from .dwc import dwc_to_observations
229240
from .geojson import geojson_to_observations
230241
from .gpx import gpx_to_observations
231242

232243
file_path = Path(filename).expanduser()
233244
ext = file_path.suffix.lower().replace('.', '')
234-
if ext == 'json':
235-
return Observation.from_json_file(file_path)
236-
elif ext == 'dwc':
237-
return dwc_to_observations(file_path)
238-
elif ext == 'geojson':
239-
return geojson_to_observations(file_path)
240-
elif ext == 'gpx':
241-
return gpx_to_observations(file_path)
242-
elif ext in ('sqlite', 'db'):
243-
from .db import get_db_observations
244-
245-
return list(get_db_observations(file_path))
246-
# For CSV, check if it came from the export tool or from API results
247-
elif ext == 'csv' and is_csv_export(file_path):
248-
df = load_csv_exports(file_path)
249-
elif ext == 'csv':
250-
df = pd.read_csv(file_path)
251-
elif ext == 'feather':
252-
df = pd.read_feather(file_path)
253-
elif ext == 'hdf':
254-
df = pd.read_hdf(file_path, 'observations')
255-
elif ext == 'parquet':
256-
df = pd.read_parquet(file_path)
257-
elif ext == 'xlsx':
258-
df = pd.read_excel(file_path)
259-
else:
260-
raise ValueError(f'File format not yet supported: {file_path.suffix}')
245+
match ext:
246+
case 'json':
247+
return Observation.from_json_file(file_path)
248+
case 'dwc':
249+
return dwc_to_observations(file_path)
250+
case 'geojson':
251+
return geojson_to_observations(file_path)
252+
case 'gpx':
253+
return gpx_to_observations(file_path)
254+
case 'sqlite' | 'db':
255+
return list(get_db_observations(file_path))
256+
# For CSV, check if it came from the export tool or from API results
257+
case 'csv':
258+
df = load_csv_exports(file_path) if is_csv_export(file_path) else pd.read_csv(file_path)
259+
case 'feather':
260+
df = pd.read_feather(file_path)
261+
case 'hdf':
262+
df = pd.read_hdf(file_path, 'observations')
263+
case 'parquet':
264+
df = pd.read_parquet(file_path)
265+
case 'xlsx':
266+
df = pd.read_excel(file_path)
267+
case _:
268+
raise ValueError(f'File format not yet supported: {file_path.suffix}')
261269

262270
return Observation.from_json_list(_df_to_dicts(df))
263271

264272

273+
def export(observations: AnyObservations, filename: PathOrStr):
274+
"""Export observations to any of the following file formats, based on file extension:
275+
276+
* CSV (``.csv``)
277+
* Darwin Core (``.dwc``)
278+
* Feather (``.feather``)
279+
* GeoJSON (``.geojson``)
280+
* GPX (``.gpx``)
281+
* HDF5 (``.hdf``)
282+
* JSON (``.json``)
283+
* Parquet (``.parquet``)
284+
* Excel (``.xlsx``)
285+
* SQLite (``.sqlite`` or ``.db``)
286+
"""
287+
from .db import create_tables, save_observations
288+
from .dwc import to_dwc
289+
from .geojson import to_geojson
290+
from .gpx import to_gpx
291+
292+
file_path = Path(filename).expanduser()
293+
ext = file_path.suffix.lower().replace('.', '')
294+
295+
match ext:
296+
case 'json':
297+
to_json(observations, file_path)
298+
case 'csv':
299+
to_csv(observations, file_path)
300+
case 'dwc':
301+
to_dwc(observations, file_path)
302+
case 'feather':
303+
to_feather(observations, file_path)
304+
case 'geojson':
305+
to_geojson(observations, file_path)
306+
case 'gpx':
307+
to_gpx(observations, file_path)
308+
case 'hdf':
309+
to_hdf(observations, file_path)
310+
case 'parquet':
311+
to_parquet(observations, file_path)
312+
case 'xlsx':
313+
to_excel(observations, file_path)
314+
case 'sqlite' | 'db':
315+
create_tables(file_path)
316+
save_observations(observations, file_path)
317+
case _:
318+
raise ValueError(f'File format not supported: {ext}')
319+
320+
265321
def write(content: str | bytes, filename: PathOrStr, mode='w'):
266322
"""Write converted observation data to a file, creating parent dirs first"""
267323
logger.info(f'Writing to {filename}')
@@ -388,6 +444,21 @@ def _fix_dimensions(flat_observations):
388444
return sorted(headers), flat_observations
389445

390446

447+
def _strip_tzinfo(dataset: Dataset) -> Dataset:
448+
"""Strip timezone info from all datetime values in a tablib Dataset, for compatibility with
449+
openpyxl
450+
"""
451+
for col in dataset.headers or []:
452+
col_idx = dataset.headers.index(col)
453+
for row_idx, row in enumerate(dataset):
454+
val = row[col_idx]
455+
if isinstance(val, datetime) and val.tzinfo is not None:
456+
dataset[row_idx] = tuple(
457+
v.replace(tzinfo=None) if i == col_idx else v for i, v in enumerate(row)
458+
)
459+
return dataset
460+
461+
391462
def _is_dataframe(obj) -> bool:
392463
try:
393464
from pandas import DataFrame

pyinaturalist_convert/csv.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@
103103
logger = getLogger(__name__)
104104

105105

106-
# TODO: Use pandas if installed, otherwise fallback to tablib?
107106
def load_csv_exports(*file_paths: PathOrStr) -> 'DataFrame':
108107
"""Read one or more CSV files from the iNaturalist export tool into a dataframe
109108

pyinaturalist_convert/geojson.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def to_geojson(
6767
)
6868

6969
if filename:
70-
write(json.dumps(feature_collection, indent=2), filename)
70+
write(json.dumps(feature_collection, indent=2, default=str), filename)
7171
return None
7272
else:
7373
return feature_collection
@@ -111,10 +111,15 @@ def _to_geojson_feature(
111111
) -> 'Feature':
112112
from geojson import Feature, Point
113113

114-
# Add geometry
115-
if not observation.get('geojson'):
116-
raise ValueError('Observation without coordinates')
117-
point = Point([float(coord) for coord in observation['geojson']['coordinates']])
114+
# Add geometry; use location tuple if not already present
115+
geojson = observation.get('geojson')
116+
if not geojson:
117+
location = observation.get('location')
118+
if not location:
119+
raise ValueError('Observation without coordinates')
120+
# location is [lat, lon]; GeoJSON coordinates are [lon, lat]
121+
geojson = {'coordinates': [float(location[1]), float(location[0])]}
122+
point = Point([float(coord) for coord in geojson['coordinates']])
118123

119124
# Add properties
120125
flat_obs = flatten_observations([observation])[0]

pyinaturalist_convert/gpx.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pyinaturalist.constants import ResponseResult
2727
from pyinaturalist.converters import convert_observation_timestamps
2828

29-
from .converters import AnyObservations, to_dicts, write
29+
from .converters import AnyObservations, PathOrStr, to_dicts, write
3030

3131
if TYPE_CHECKING:
3232
from gpxpy.geo import Location
@@ -36,7 +36,7 @@
3636

3737

3838
def to_gpx(
39-
observations: AnyObservations, filename: Optional[str] = None, waypoints: bool = False
39+
observations: AnyObservations, filename: Optional[PathOrStr] = None, waypoints: bool = False
4040
) -> Optional['GPX']:
4141
"""Convert a list of observations to a GPX track (default) or a set of GPX waypoints.
4242
@@ -72,7 +72,7 @@ def to_gpx(
7272
return gpx
7373

7474

75-
def gpx_to_observations(filename) -> list[Observation]:
75+
def gpx_to_observations(filename: PathOrStr) -> list[Observation]:
7676
"""Load observations from a GPX file.
7777
7878
Args:

test/test_converters.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from pyinaturalist import Observation, Taxon, User
66

77
from pyinaturalist_convert.converters import (
8+
_strip_tzinfo,
9+
export,
810
read,
911
to_csv,
1012
to_dataframe,
@@ -77,6 +79,36 @@ def test_read__sqlite(tmp_path):
7779
assert observations[0].id == 45524803
7880

7981

82+
@pytest.mark.parametrize(
83+
'ext', ['csv', 'db', 'dwc', 'feather', 'geojson', 'gpx', 'json', 'parquet', 'sqlite', 'xlsx']
84+
)
85+
def test_export(tmp_path, ext):
86+
observations = Observation.from_json_list(load_sample_data('observation.json'))
87+
file_path = tmp_path / f'observations.{ext}'
88+
export(observations, file_path)
89+
result = read(file_path)
90+
assert len(result) >= 1
91+
assert isinstance(result[0], Observation)
92+
93+
94+
def test_export__unsupported_format(tmp_path):
95+
observations = load_sample_data('observation.json')
96+
with pytest.raises(ValueError, match='File format not supported: txt'):
97+
export(observations, tmp_path / 'observations.txt')
98+
99+
100+
def test_strip_tzinfo():
101+
observations = Observation.from_json_list(load_sample_data('observations.json'))
102+
dataset = to_dataset(observations)
103+
# Confirm datetime columns are tz-aware before stripping
104+
assert dataset['created_at'][0].tzinfo is not None
105+
106+
_strip_tzinfo(dataset)
107+
assert all(v.tzinfo is None for v in dataset['created_at'])
108+
assert all(v.tzinfo is None for v in dataset['observed_on'])
109+
assert all(v.tzinfo is None for v in dataset['updated_at'])
110+
111+
80112
def test_to_dataset():
81113
observations = Observation.from_json_list(load_sample_data('observations.json'))
82114
dataset = to_dataset(observations)

0 commit comments

Comments
 (0)