Skip to content

Commit e8ce31b

Browse files
authored
Add export() function that wraps writing to all supported formats (based on file extension) (#214)
Plus fixes for various bugs uncovered by tests: - Strip timezone info from all datetime values before saving to xlsx (for compatibility with openpyxl) - Fix issues with some observation formats passed to `to_json()` - Update `to_geojson()` to use location tuple if available
2 parents 36c2f11 + f304797 commit e8ce31b

File tree

6 files changed

+157
-46
lines changed

6 files changed

+157
-46
lines changed

HISTORY.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@
44

55
- Add database migrations with Alembic and include in PyPI package
66
- Add a `migrate()` helper for downstream libraries
7+
- Add a unified `export()` function to write observations to any supported format (based on file extension)
78
- Extend `read()` to support GeoJSON, GPX, DwC, and SQLite formats
89
- Improve CSV loading performance
910
- Rewrite taxonomy aggregation for significantly improved performance
1011
- Optimize DwC-A table loading (delay index creation until after all rows are inserted)
1112
- Raise a more descriptive error when SQLAlchemy is missing
13+
- Strip timezone info from all datetime values before saving to xlsx (for compatibility with openpyxl)
14+
- Fix issues with some observation formats passed to `to_json()`
15+
- Update `to_geojson()` to use `location` tuple if available
1216

1317
## 0.7.0 (2026-01-22)
1418

pyinaturalist_convert/converters.py

Lines changed: 108 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424
>>> to_json(observations, 'my_observations.json')
2525
>>> to_parquet(observations, 'my_observations.parquet')
2626
27+
Or export to any supported format by file extension:
28+
29+
>>> export(observations, 'my_observations.csv')
30+
>>> export(observations, 'my_observations.geojson')
31+
>>> export(observations, 'my_observations.gpx')
32+
2733
Load back into Observation objects:
2834
2935
>>> observations = read('my_observations.csv')
@@ -33,11 +39,13 @@
3339
>>> observations = read('my_observations.json')
3440
>>> observations = read('my_observations.parquet')
3541
42+
3643
**Export functions:**
3744
3845
.. autosummary::
3946
:nosignatures:
4047
48+
export
4149
to_csv
4250
to_excel
4351
to_feather
@@ -61,9 +69,10 @@
6169
import json
6270
from collections.abc import Iterable, Sequence
6371
from copy import deepcopy
72+
from datetime import datetime
6473
from logging import getLogger
6574
from pathlib import Path
66-
from typing import TYPE_CHECKING, Optional, TypeAlias
75+
from typing import TYPE_CHECKING, TypeAlias
6776

6877
from flatten_dict import flatten, unflatten
6978
from pyinaturalist import BaseModel, JsonResponse, ModelObjects, Observation, ResponseResult, Taxon
@@ -147,7 +156,7 @@ def to_dicts(value: InputTypes) -> Iterable[dict]:
147156
return [value] # type: ignore [list-item]
148157

149158

150-
def to_csv(observations: AnyObservations, filename: Optional[str] = None):
159+
def to_csv(observations: AnyObservations, filename: PathOrStr):
151160
"""Convert observations to CSV"""
152161
from pandas import DataFrame
153162

@@ -178,30 +187,31 @@ def to_dataset(observations: AnyObservations) -> Dataset:
178187
return dataset
179188

180189

181-
def to_excel(observations: AnyObservations, filename: str):
190+
def to_excel(observations: AnyObservations, filename: PathOrStr):
182191
"""Convert observations to an Excel spreadsheet (xlsx)"""
183-
xlsx_observations = to_dataset(observations).get_xlsx()
184-
write(xlsx_observations, filename, 'wb')
192+
dataset = to_dataset(observations)
193+
dataset = _strip_tzinfo(dataset)
194+
write(dataset.get_xlsx(), filename, 'wb')
185195

186196

187-
def to_feather(observations: AnyObservations, filename: str):
197+
def to_feather(observations: AnyObservations, filename: PathOrStr):
188198
"""Convert observations into a Feather file"""
189199
df = to_dataframe(observations)
190200
df.to_feather(filename)
191201

192202

193-
def to_hdf(observations: AnyObservations, filename: str):
203+
def to_hdf(observations: AnyObservations, filename: PathOrStr):
194204
"""Convert observations into a HDF5 file"""
195205
df = to_dataframe(observations)
196206
df.to_hdf(filename, 'observations')
197207

198208

199-
def to_json(observations: AnyObservations, filename: str):
209+
def to_json(observations: AnyObservations, filename: PathOrStr):
200210
"""Convert observations into a JSON file"""
201-
write(json.dumps(observations, indent=2, default=str), filename)
211+
write(json.dumps(to_dicts(observations), indent=2, default=str), filename)
202212

203213

204-
def to_parquet(observations: AnyObservations, filename: str):
214+
def to_parquet(observations: AnyObservations, filename: PathOrStr):
205215
"""Convert observations into a Parquet file"""
206216
df = to_dataframe(observations)
207217
df.to_parquet(filename)
@@ -225,43 +235,89 @@ def read(filename: PathOrStr) -> list[Observation]:
225235
import pandas as pd
226236

227237
from .csv import is_csv_export, load_csv_exports
238+
from .db import get_db_observations
228239
from .dwc import dwc_to_observations
229240
from .geojson import geojson_to_observations
230241
from .gpx import gpx_to_observations
231242

232243
file_path = Path(filename).expanduser()
233244
ext = file_path.suffix.lower().replace('.', '')
234-
if ext == 'json':
235-
return Observation.from_json_file(file_path)
236-
elif ext == 'dwc':
237-
return dwc_to_observations(file_path)
238-
elif ext == 'geojson':
239-
return geojson_to_observations(file_path)
240-
elif ext == 'gpx':
241-
return gpx_to_observations(file_path)
242-
elif ext in ('sqlite', 'db'):
243-
from .db import get_db_observations
244-
245-
return list(get_db_observations(file_path))
246-
# For CSV, check if it came from the export tool or from API results
247-
elif ext == 'csv' and is_csv_export(file_path):
248-
df = load_csv_exports(file_path)
249-
elif ext == 'csv':
250-
df = pd.read_csv(file_path)
251-
elif ext == 'feather':
252-
df = pd.read_feather(file_path)
253-
elif ext == 'hdf':
254-
df = pd.read_hdf(file_path, 'observations')
255-
elif ext == 'parquet':
256-
df = pd.read_parquet(file_path)
257-
elif ext == 'xlsx':
258-
df = pd.read_excel(file_path)
259-
else:
260-
raise ValueError(f'File format not yet supported: {file_path.suffix}')
245+
match ext:
246+
case 'json':
247+
return Observation.from_json_file(file_path)
248+
case 'dwc':
249+
return dwc_to_observations(file_path)
250+
case 'geojson':
251+
return geojson_to_observations(file_path)
252+
case 'gpx':
253+
return gpx_to_observations(file_path)
254+
case 'sqlite' | 'db':
255+
return list(get_db_observations(file_path))
256+
# For CSV, check if it came from the export tool or from API results
257+
case 'csv':
258+
df = load_csv_exports(file_path) if is_csv_export(file_path) else pd.read_csv(file_path)
259+
case 'feather':
260+
df = pd.read_feather(file_path)
261+
case 'hdf':
262+
df = pd.read_hdf(file_path, 'observations')
263+
case 'parquet':
264+
df = pd.read_parquet(file_path)
265+
case 'xlsx':
266+
df = pd.read_excel(file_path)
267+
case _:
268+
raise ValueError(f'File format not yet supported: {file_path.suffix}')
261269

262270
return Observation.from_json_list(_df_to_dicts(df))
263271

264272

273+
def export(observations: AnyObservations, filename: PathOrStr):
274+
"""Export observations to any of the following file formats, based on file extension:
275+
276+
* CSV (``.csv``)
277+
* Darwin Core (``.dwc``)
278+
* Feather (``.feather``)
279+
* GeoJSON (``.geojson``)
280+
* GPX (``.gpx``)
281+
* HDF5 (``.hdf``)
282+
* JSON (``.json``)
283+
* Parquet (``.parquet``)
284+
* Excel (``.xlsx``)
285+
* SQLite (``.sqlite`` or ``.db``)
286+
"""
287+
from .db import create_tables, save_observations
288+
from .dwc import to_dwc
289+
from .geojson import to_geojson
290+
from .gpx import to_gpx
291+
292+
file_path = Path(filename).expanduser()
293+
ext = file_path.suffix.lower().replace('.', '')
294+
295+
match ext:
296+
case 'json':
297+
to_json(observations, file_path)
298+
case 'csv':
299+
to_csv(observations, file_path)
300+
case 'dwc':
301+
to_dwc(observations, file_path)
302+
case 'feather':
303+
to_feather(observations, file_path)
304+
case 'geojson':
305+
to_geojson(observations, file_path)
306+
case 'gpx':
307+
to_gpx(observations, file_path)
308+
case 'hdf':
309+
to_hdf(observations, file_path)
310+
case 'parquet':
311+
to_parquet(observations, file_path)
312+
case 'xlsx':
313+
to_excel(observations, file_path)
314+
case 'sqlite' | 'db':
315+
create_tables(file_path)
316+
save_observations(observations, file_path)
317+
case _:
318+
raise ValueError(f'File format not supported: {ext}')
319+
320+
265321
def write(content: str | bytes, filename: PathOrStr, mode='w'):
266322
"""Write converted observation data to a file, creating parent dirs first"""
267323
logger.info(f'Writing to {filename}')
@@ -388,6 +444,21 @@ def _fix_dimensions(flat_observations):
388444
return sorted(headers), flat_observations
389445

390446

447+
def _strip_tzinfo(dataset: Dataset) -> Dataset:
448+
"""Strip timezone info from all datetime values in a tablib Dataset, for compatibility with
449+
openpyxl
450+
"""
451+
for col in dataset.headers or []:
452+
col_idx = dataset.headers.index(col)
453+
for row_idx, row in enumerate(dataset):
454+
val = row[col_idx]
455+
if isinstance(val, datetime) and val.tzinfo is not None:
456+
dataset[row_idx] = tuple(
457+
v.replace(tzinfo=None) if i == col_idx else v for i, v in enumerate(row)
458+
)
459+
return dataset
460+
461+
391462
def _is_dataframe(obj) -> bool:
392463
try:
393464
from pandas import DataFrame

pyinaturalist_convert/csv.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@
103103
logger = getLogger(__name__)
104104

105105

106-
# TODO: Use pandas if installed, otherwise fallback to tablib?
107106
def load_csv_exports(*file_paths: PathOrStr) -> 'DataFrame':
108107
"""Read one or more CSV files from the iNaturalist export tool into a dataframe
109108

pyinaturalist_convert/geojson.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def to_geojson(
6767
)
6868

6969
if filename:
70-
write(json.dumps(feature_collection, indent=2), filename)
70+
write(json.dumps(feature_collection, indent=2, default=str), filename)
7171
return None
7272
else:
7373
return feature_collection
@@ -111,10 +111,15 @@ def _to_geojson_feature(
111111
) -> 'Feature':
112112
from geojson import Feature, Point
113113

114-
# Add geometry
115-
if not observation.get('geojson'):
116-
raise ValueError('Observation without coordinates')
117-
point = Point([float(coord) for coord in observation['geojson']['coordinates']])
114+
# Add geometry; use location tuple if not already present
115+
geojson = observation.get('geojson')
116+
if not geojson:
117+
location = observation.get('location')
118+
if not location:
119+
raise ValueError('Observation without coordinates')
120+
# location is [lat, lon]; GeoJSON coordinates are [lon, lat]
121+
geojson = {'coordinates': [float(location[1]), float(location[0])]}
122+
point = Point([float(coord) for coord in geojson['coordinates']])
118123

119124
# Add properties
120125
flat_obs = flatten_observations([observation])[0]

pyinaturalist_convert/gpx.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pyinaturalist.constants import ResponseResult
2727
from pyinaturalist.converters import convert_observation_timestamps
2828

29-
from .converters import AnyObservations, to_dicts, write
29+
from .converters import AnyObservations, PathOrStr, to_dicts, write
3030

3131
if TYPE_CHECKING:
3232
from gpxpy.geo import Location
@@ -36,7 +36,7 @@
3636

3737

3838
def to_gpx(
39-
observations: AnyObservations, filename: Optional[str] = None, waypoints: bool = False
39+
observations: AnyObservations, filename: Optional[PathOrStr] = None, waypoints: bool = False
4040
) -> Optional['GPX']:
4141
"""Convert a list of observations to a GPX track (default) or a set of GPX waypoints.
4242
@@ -72,7 +72,7 @@ def to_gpx(
7272
return gpx
7373

7474

75-
def gpx_to_observations(filename) -> list[Observation]:
75+
def gpx_to_observations(filename: PathOrStr) -> list[Observation]:
7676
"""Load observations from a GPX file.
7777
7878
Args:

test/test_converters.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from pyinaturalist import Observation, Taxon, User
66

77
from pyinaturalist_convert.converters import (
8+
_strip_tzinfo,
9+
export,
810
read,
911
to_csv,
1012
to_dataframe,
@@ -77,6 +79,36 @@ def test_read__sqlite(tmp_path):
7779
assert observations[0].id == 45524803
7880

7981

82+
@pytest.mark.parametrize(
83+
'ext', ['csv', 'db', 'dwc', 'feather', 'geojson', 'gpx', 'json', 'parquet', 'sqlite', 'xlsx']
84+
)
85+
def test_export(tmp_path, ext):
86+
observations = Observation.from_json_list(load_sample_data('observation.json'))
87+
file_path = tmp_path / f'observations.{ext}'
88+
export(observations, file_path)
89+
result = read(file_path)
90+
assert len(result) >= 1
91+
assert isinstance(result[0], Observation)
92+
93+
94+
def test_export__unsupported_format(tmp_path):
95+
observations = load_sample_data('observation.json')
96+
with pytest.raises(ValueError, match='File format not supported: txt'):
97+
export(observations, tmp_path / 'observations.txt')
98+
99+
100+
def test_strip_tzinfo():
101+
observations = Observation.from_json_list(load_sample_data('observations.json'))
102+
dataset = to_dataset(observations)
103+
# Confirm datetime columns are tz-aware before stripping
104+
assert dataset['created_at'][0].tzinfo is not None
105+
106+
_strip_tzinfo(dataset)
107+
assert all(v.tzinfo is None for v in dataset['created_at'])
108+
assert all(v.tzinfo is None for v in dataset['observed_on'])
109+
assert all(v.tzinfo is None for v in dataset['updated_at'])
110+
111+
80112
def test_to_dataset():
81113
observations = Observation.from_json_list(load_sample_data('observations.json'))
82114
dataset = to_dataset(observations)

0 commit comments

Comments
 (0)