Skip to content

Commit 4dcfbff

Browse files
authored
Merge pull request #26 from quantifyearth/mwd-point-validation
Add script to fetch GBIF data for point validation
2 parents 335aedd + a1f4856 commit 4dcfbff

File tree

4 files changed

+255
-0
lines changed

4 files changed

+255
-0
lines changed

.github/workflows/pull-request.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ jobs:
5757
aoh-endemism --help
5858
aoh-collate-data --help
5959
aoh-validate-prevalence --help
60+
aoh-fetch-gbif-data --help
6061
6162
- name: Test package imports
6263
run: |

aoh/validation/collate_data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
COLUMNS = [
1010
"id_no",
1111
"assessment_id",
12+
"assessment_year",
1213
"class_name",
1314
"family_name",
1415
"scientific_name",

aoh/validation/fetch_gbif_data.py

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
import argparse
2+
import os
3+
import sys
4+
import time
5+
import zipfile
6+
from pathlib import Path
7+
from typing import Any, Dict, Optional, Tuple
8+
9+
import pandas as pd
10+
import pygbif # type: ignore
11+
import requests
12+
from pygbif.occurrences.download import GbifDownload # type: ignore
13+
14+
def generate_iucn_to_gbif_map(
    collated_data_path: Path,
    output_dir_path: Path,
) -> pd.DataFrame:
    """Build (or extend) a CSV mapping IUCN taxon IDs to GBIF usage keys.

    Each species' scientific name is looked up against the GBIF backbone
    taxonomy. Results (including misses, stored as NA) are cached in
    ``output_dir_path / "map.csv"`` so repeated runs only query GBIF for
    species we've not seen before.

    Args:
        collated_data_path: CSV of collated AoH data; must have ``id_no``,
            ``scientific_name`` and ``assessment_year`` columns.
        output_dir_path: Directory where ``map.csv`` is read/written.

    Returns:
        DataFrame with columns ``iucn_taxon_id``, ``scientific_name``,
        ``assessment_year`` and ``gbif_id`` (nullable Int64; NA when no
        GBIF match was found).
    """
    collated_data = pd.read_csv(collated_data_path)

    map_filename = output_dir_path / "map.csv"
    id_map : Dict[int,Tuple[str,str,int,Optional[int]]] = {}

    def _save_map() -> pd.DataFrame:
        # Persist whatever we have so far. gbif_id uses pandas' nullable
        # integer type so unmatched species round-trip as NA rather than
        # being coerced to NaN floats.
        map_df = pd.DataFrame(
            id_map.values(),
            columns=["iucn_taxon_id", "scientific_name", "assessment_year", "gbif_id"],
        )
        map_df["gbif_id"] = map_df["gbif_id"].astype('Int64')
        map_df.to_csv(map_filename, index=False)
        return map_df

    # To save spamming the GBIF API, see if there's already a map
    # and if so we just request GBIF IDs for data we've not seen before
    try:
        existing_map = pd.read_csv(map_filename)
        for _, row in existing_map.iterrows():
            id_map[row.iucn_taxon_id] = (row.iucn_taxon_id, row.scientific_name, row.assessment_year, row.gbif_id)
    except (AttributeError, FileNotFoundError):
        # No cache yet, or an unexpectedly-shaped one: start fresh.
        pass

    # First we make a map
    for _, row in collated_data.iterrows():
        taxon_id = row.id_no
        if taxon_id in id_map:
            continue
        assessment_year = row.assessment_year
        scientific_name = row.scientific_name

        # Without a name we can't query GBIF; without a year we can't
        # filter occurrences later.
        if not assessment_year:
            continue
        if not scientific_name:
            continue

        try:
            result = pygbif.species.name_backbone(scientific_name, rank='species')
            if result["matchType"] not in ["EXACT", "FUZZY"]:
                raise ValueError("no match found")
            gbif_id = result["usageKey"]

            id_map[taxon_id] = (taxon_id, scientific_name, assessment_year, int(gbif_id))
        except (KeyError, ValueError):
            # Record the miss so we don't re-query this species next run.
            id_map[taxon_id] = (taxon_id, scientific_name, assessment_year, None)
        except requests.exceptions.ConnectionError:
            # GBIF is no longer happy to talk to us? We should cache whatever
            # data we already have and give up.
            _save_map()
            sys.exit("Connection error from GBIF, aborting.")

        time.sleep(0.1) # rate limiting

    return _save_map()
76+
77+
def build_gbif_query(id_map: pd.DataFrame) -> Any:
    """Build a GBIF occurrence-download predicate from the IUCN/GBIF map.

    Args:
        id_map: DataFrame with columns ``iucn_taxon_id``, ``scientific_name``,
            ``assessment_year`` and ``gbif_id`` (nullable).

    Returns:
        A nested predicate dict: an ``or`` over one ``and`` clause per species
        that has a GBIF ID, selecting georeferenced records from after the
        species' assessment year.
    """
    # BUG FIX: `id_map.gbif_id is not None` compares the whole Series object
    # to None (always True), so the original indexed the frame with a scalar
    # and raised KeyError. notna() gives the intended per-row mask, dropping
    # species for which no GBIF match was found.
    map_with_gbif_id = id_map[id_map.gbif_id.notna()]

    queries = [
        {
            "type": "and",
            "predicates": [
                {
                    "type": "equals",
                    "key": "TAXON_KEY",
                    "value": int(gbif_id),
                },
                {
                    "type": "greaterThan",
                    "key": "YEAR",
                    "value": int(assessment_year),
                },
                {
                    "type": "equals",
                    "key": "HAS_COORDINATE",
                    "value": "TRUE"
                },
                {
                    "type": "equals",
                    "key": "HAS_GEOSPATIAL_ISSUE",
                    "value": "FALSE"
                }
            ]
        }
        for _, _, assessment_year, gbif_id in map_with_gbif_id.itertuples(index=False)
    ]

    return {
        "type": "or",
        "predicates": queries
    }
114+
115+
def build_point_validation_table(
    gbif_data_path: Path,
    map_df: pd.DataFrame,
    output_csv_path: Path,
) -> None:
    """Join downloaded GBIF occurrences back onto the IUCN/GBIF map.

    Reads the tab-separated GBIF occurrence export, matches each record to
    its IUCN taxon via the GBIF usage key, and writes just the columns
    needed for point validation to ``output_csv_path``.

    Args:
        gbif_data_path: TSV occurrence file from a GBIF download.
        map_df: IUCN-to-GBIF map with a ``gbif_id`` column.
        output_csv_path: Destination CSV path.
    """
    occurrences = pd.read_csv(gbif_data_path, sep='\t').rename(
        columns={"taxonKey": "gbif_id"}
    )
    # Inner join: occurrences whose taxon key isn't in the map are dropped.
    joined = occurrences.merge(map_df, on="gbif_id", how='inner')
    wanted = ["iucn_taxon_id", "gbif_id", "decimalLatitude", "decimalLongitude", "year"]
    joined[wanted].to_csv(output_csv_path, index=False)
125+
126+
def fetch_gbif_data(
127+
collated_data_path: Path,
128+
gbif_username : str,
129+
gbif_email: str,
130+
gbif_password: str,
131+
output_dir_path: Path,
132+
) -> None:
133+
final_result_path = output_dir_path / "points.csv"
134+
if final_result_path.exists():
135+
return
136+
137+
os.makedirs(output_dir_path, exist_ok=True)
138+
download_key_cache_filename = output_dir_path / "download_key"
139+
140+
map_df = generate_iucn_to_gbif_map(collated_data_path, output_dir_path)
141+
if map_df is None or len(map_df) == 0:
142+
sys.exit("No specices in GBIF ID list, aborting")
143+
144+
if not download_key_cache_filename.exists():
145+
request = GbifDownload(gbif_username, gbif_email)
146+
query = build_gbif_query(map_df)
147+
request.add_predicate_dict(query)
148+
149+
download_key = request.post_download(gbif_username, gbif_password)
150+
download_key_cache_filename = output_dir_path / "download_key"
151+
with open(download_key_cache_filename, "w", encoding="UTF-8") as f:
152+
f.write(download_key)
153+
else:
154+
with open(download_key_cache_filename, "r", encoding="UTF-8") as f:
155+
download_key = f.read()
156+
157+
expected_csv = output_dir_path / f"{download_key}.csv"
158+
if not expected_csv.exists():
159+
expected_download = output_dir_path / f"{download_key}.zip"
160+
if not expected_download.exists():
161+
while True:
162+
metadata = pygbif.occurrences.download_meta(download_key)
163+
match metadata["status"]:
164+
case "PREPARING" | "SUSPENDED" | "RUNNING":
165+
print(f"Download status: {metadata['status']}, sleeping...")
166+
time.sleep(30.0)
167+
continue
168+
case "SUCCEEDED":
169+
file_path = pygbif.occurrences.download_get(download_key, path=output_dir_path)
170+
print(f"Results are in {file_path}")
171+
break
172+
case _:
173+
sys.exit(f"Failed to download data, status: {metadata['status']}")
174+
with zipfile.ZipFile(expected_download, 'r') as zip_file:
175+
zip_file.extractall(output_dir_path)
176+
if not expected_csv.exists():
177+
sys.exit("Extracted GBIF zip did not contain expected CSV file")
178+
179+
build_point_validation_table(
180+
expected_csv,
181+
map_df,
182+
final_result_path,
183+
)
184+
185+
def main() -> None:
    """CLI entry point: parse arguments, validate credentials, fetch data.

    GBIF credentials may be supplied as flags or via the GBIF_USERNAME,
    GBIF_EMAIL and GBIF_PASSWORD environment variables.
    """
    parser = argparse.ArgumentParser(
        description="Fetch GBIF records for species for validation.",
        epilog='''
Environment Variables:
  GBIF_USERNAME   Username of user's GBIF account.
  GBIF_EMAIL      E-mail of user's GBIF account.
  GBIF_PASSWORD   Password of user's GBIF account.
''',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--collated_aoh_data',
        type=Path,
        help="CSV containing collated AoH data",
        required=True,
        dest="collated_data_path",
    )
    # Credential flags default to the environment, so they are only
    # "required" if the corresponding variable is unset (checked below).
    parser.add_argument(
        '--gbif_username',
        type=str,
        default=os.getenv('GBIF_USERNAME'),
        help="Username of user's GBIF account. Can also be set in environment.",
        dest="gbif_username",
    )
    parser.add_argument(
        '--gbif_email',
        type=str,
        default=os.getenv('GBIF_EMAIL'),
        help="E-mail of user's GBIF account. Can also be set in environment.",
        dest="gbif_email",
    )
    parser.add_argument(
        '--gbif_password',
        type=str,
        default=os.getenv('GBIF_PASSWORD'),
        help="Password of user's GBIF account. Can also be set in environment.",
        dest="gbif_password",
    )
    parser.add_argument(
        "--output_dir",
        type=Path,
        required=True,
        dest="output_dir_path",
        help="Destination directory for GBIF data.",
    )
    args = parser.parse_args()

    credential_checks = (
        (args.gbif_username, '--gbif_username is required (or set GBIF_USERNAME env var)'),
        (args.gbif_email, '--gbif_email is required (or set GBIF_EMAIL env var)'),
        (args.gbif_password, '--gbif_password is required (or set GBIF_PASSWORD env var)'),
    )
    for value, message in credential_checks:
        if not value:
            parser.error(message)

    fetch_gbif_data(
        args.collated_data_path,
        args.gbif_username,
        args.gbif_email,
        args.gbif_password,
        args.output_dir_path,
    )


if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,14 @@ dependencies = [
3939
[project.optional-dependencies]
4040
validation = [
4141
"pymer4==0.8.2",
42+
"pygbif",
4243
]
4344
dev = [
4445
"pylint",
4546
"mypy",
4647
"pytest",
4748
"types-psutil",
49+
"types-requests",
4850
"pandas-stubs",
4951
"geojson",
5052
"pytest-cov",
@@ -64,6 +66,7 @@ aoh-species-richness = "aoh.summaries.species_richness:main"
6466
aoh-endemism = "aoh.summaries.endemism:main"
6567
aoh-collate-data = "aoh.validation.collate_data:main"
6668
aoh-validate-prevalence = "aoh.validation.validate_map_prevalence:main"
69+
aoh-fetch-gbif-data = "aoh.validation.fetch_gbif_data:main"
6770

6871
[tool.setuptools]
6972
packages = ["aoh", "aoh.summaries", "aoh.validation"]

0 commit comments

Comments
 (0)