Skip to content

Commit 04d8fa1

Browse files
authored
Merge pull request #32 from quantifyearth/mwd-gbif-improvements
GBIF occurrence data collection improvements
2 parents f0289a0 + f3ee332 commit 04d8fa1

File tree

2 files changed

+67
-38
lines changed

2 files changed

+67
-38
lines changed

CHANGES.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## v1.1.0 (4/11/2025)
1+
## v1.1.0 (11/11/2025)
22

33
### Added
44

@@ -8,6 +8,7 @@
88

99
* Performance improvements and simplification to habitat processing.
1010
* Store more analysis data from model validation.
11+
* Improve performance of GBIF occurrence data fetches.
1112

1213
### Fixed
1314

aoh/validation/fetch_gbif_data.py

Lines changed: 65 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,36 @@
1414
def generate_iucn_to_gbif_map(
1515
collated_data_path: Path,
1616
output_dir_path: Path,
17+
taxa: str,
1718
) -> pd.DataFrame:
1819
collated_data = pd.read_csv(collated_data_path)
1920

2021
# To save spamming the GBIF API, see if there's already a map
2122
# and if so we just request GBIF IDs for data we've not seen before
2223
map_filename = output_dir_path / "map.csv"
23-
id_map : Dict[int,Tuple[str,str,int,Optional[int]]] = {}
24+
id_map : Dict[int,Tuple[str,str,int,Optional[int],str]] = {}
2425
try:
2526
existing_map = pd.read_csv(map_filename)
2627
for _, row in existing_map.iterrows():
27-
id_map[row.iucn_taxon_id] = (row.iucn_taxon_id, row.scientific_name, row.assessment_year, row.gbif_id)
28+
id_map[row.iucn_taxon_id] = (
29+
row.iucn_taxon_id,
30+
row.scientific_name,
31+
row.assessment_year,
32+
row.gbif_id,
33+
row.class_name
34+
)
2835
except (AttributeError, FileNotFoundError):
2936
pass
3037

3138
# First we make a map
32-
for _, row in collated_data.iterrows():
39+
for _, row in collated_data[collated_data.class_name==taxa].iterrows():
3340
taxon_id = row.id_no
3441
if taxon_id in id_map:
42+
print(f"skipping {taxon_id}")
3543
continue
3644
assessment_year = row.assessment_year
3745
scientific_name = row.scientific_name
46+
class_name = row.class_name
3847

3948
if not assessment_year:
4049
continue
@@ -47,27 +56,28 @@ def generate_iucn_to_gbif_map(
4756
raise ValueError("no match found")
4857
gbif_id = result["usageKey"]
4958

50-
id_map[taxon_id] = (taxon_id, scientific_name, assessment_year, int(gbif_id))
59+
id_map[taxon_id] = (taxon_id, scientific_name, assessment_year, int(gbif_id), class_name)
5160
except (KeyError, ValueError):
52-
id_map[taxon_id] = (taxon_id, scientific_name, assessment_year, None)
53-
except requests.exceptions.ConnectionError:
61+
id_map[taxon_id] = (taxon_id, scientific_name, assessment_year, None, class_name)
62+
except requests.exceptions.ConnectionError as exc:
5463
# GBIF is not longer happy to talk to us? We should cache whatever data we already
5564
# have and give up
5665
map_data = id_map.values()
5766
map_df = pd.DataFrame(
5867
map_data,
59-
columns=["iucn_taxon_id", "scientific_name", "assessment_year", "gbif_id"],
68+
columns=["iucn_taxon_id", "scientific_name", "assessment_year", "gbif_id", "class_name"],
6069
)
6170
map_df["gbif_id"] = map_df["gbif_id"].astype('Int64')
6271
map_df.to_csv(map_filename, index=False)
72+
print(exc)
6373
sys.exit("Connection error from GBIF, aborting.")
6474

65-
time.sleep(0.1) # rate limiting
75+
time.sleep(0.5) # rate limiting
6676

6777
map_data = id_map.values()
6878
map_df = pd.DataFrame(
6979
map_data,
70-
columns=["iucn_taxon_id", "scientific_name", "assessment_year", "gbif_id"],
80+
columns=["iucn_taxon_id", "scientific_name", "assessment_year", "gbif_id", "class_name"],
7181
)
7282
map_df["gbif_id"] = map_df["gbif_id"].astype('Int64')
7383
map_df.to_csv(map_filename, index=False)
@@ -76,40 +86,49 @@ def generate_iucn_to_gbif_map(
7686

7787
def build_gbif_query(id_map: pd.DataFrame) -> Any:
7888

79-
map_with_gbif_id = id_map[id_map.gbif_id is not None]
89+
map_with_gbif_id = id_map[id_map.gbif_id.notna()]
90+
request_data = map_with_gbif_id[["assessment_year", "gbif_id"]]
91+
92+
# There should be tens of assessment years vs thousands of species, so we can use that to reduce the query count:
93+
grouped = request_data.groupby('assessment_year')['gbif_id'].apply(list)
8094

8195
queries = [
8296
{
8397
"type": "and",
8498
"predicates": [
8599
{
86-
"type": "equals",
100+
"type": "in",
87101
"key": "TAXON_KEY",
88-
"value": int(gbif_id),
102+
"values": [str(int(gbif_id)) for gbif_id in gbif_ids]
89103
},
90104
{
91105
"type": "greaterThan",
92106
"key": "YEAR",
93-
"value": int(assessment_year),
107+
"value": str(int(assessment_year)), # type: ignore
94108
},
95-
{
96-
"type": "equals",
97-
"key": "HAS_COORDINATE",
98-
"value": "TRUE"
99-
},
100-
{
101-
"type": "equals",
102-
"key": "HAS_GEOSPATIAL_ISSUE",
103-
"value": "FALSE"
104-
}
105109
]
106110
}
107-
for _, _, assessment_year, gbif_id in map_with_gbif_id.itertuples(index=False)
111+
for assessment_year, gbif_ids in grouped.items()
108112
]
109113

110114
return {
111-
"type": "or",
112-
"predicates": queries
115+
"type": "and",
116+
"predicates": [
117+
{
118+
"type": "or",
119+
"predicates": queries
120+
},
121+
{
122+
"type": "equals",
123+
"key": "HAS_COORDINATE",
124+
"value": "TRUE"
125+
},
126+
{
127+
"type": "equals",
128+
"key": "HAS_GEOSPATIAL_ISSUE",
129+
"value": "FALSE"
130+
},
131+
]
113132
}
114133

115134
def build_point_validation_table(
@@ -125,19 +144,21 @@ def build_point_validation_table(
125144

126145
def fetch_gbif_data(
127146
collated_data_path: Path,
128-
gbif_username : str,
147+
taxa: str,
148+
gbif_username: str,
129149
gbif_email: str,
130150
gbif_password: str,
131-
output_dir_path: Path,
151+
toplevel_output_dir_path: Path,
132152
) -> None:
133-
final_result_path = output_dir_path / "points.csv"
153+
taxa_output_dir_path = toplevel_output_dir_path / taxa
154+
final_result_path = taxa_output_dir_path / "points.csv"
134155
if final_result_path.exists():
135156
return
136157

137-
os.makedirs(output_dir_path, exist_ok=True)
138-
download_key_cache_filename = output_dir_path / "download_key"
158+
os.makedirs(taxa_output_dir_path, exist_ok=True)
159+
download_key_cache_filename = taxa_output_dir_path / "download_key"
139160

140-
map_df = generate_iucn_to_gbif_map(collated_data_path, output_dir_path)
161+
map_df = generate_iucn_to_gbif_map(collated_data_path, taxa_output_dir_path, taxa)
141162
if map_df is None or len(map_df) == 0:
142163
sys.exit("No specices in GBIF ID list, aborting")
143164

@@ -147,16 +168,16 @@ def fetch_gbif_data(
147168
request.add_predicate_dict(query)
148169

149170
download_key = request.post_download(gbif_username, gbif_password)
150-
download_key_cache_filename = output_dir_path / "download_key"
171+
download_key_cache_filename = taxa_output_dir_path / "download_key"
151172
with open(download_key_cache_filename, "w", encoding="UTF-8") as f:
152173
f.write(download_key)
153174
else:
154175
with open(download_key_cache_filename, "r", encoding="UTF-8") as f:
155176
download_key = f.read()
156177

157-
expected_csv = output_dir_path / f"{download_key}.csv"
178+
expected_csv = taxa_output_dir_path / f"{download_key}.csv"
158179
if not expected_csv.exists():
159-
expected_download = output_dir_path / f"{download_key}.zip"
180+
expected_download = taxa_output_dir_path / f"{download_key}.zip"
160181
if not expected_download.exists():
161182
while True:
162183
metadata = pygbif.occurrences.download_meta(download_key)
@@ -166,13 +187,13 @@ def fetch_gbif_data(
166187
time.sleep(30.0)
167188
continue
168189
case "SUCCEEDED":
169-
file_path = pygbif.occurrences.download_get(download_key, path=output_dir_path)
190+
file_path = pygbif.occurrences.download_get(download_key, path=taxa_output_dir_path)
170191
print(f"Results are in {file_path}")
171192
break
172193
case _:
173194
sys.exit(f"Failed to download data, status: {metadata['status']}")
174195
with zipfile.ZipFile(expected_download, 'r') as zip_file:
175-
zip_file.extractall(output_dir_path)
196+
zip_file.extractall(taxa_output_dir_path)
176197
if not expected_csv.exists():
177198
sys.exit("Extracted GBIF zip did not contain expected CSV file")
178199

@@ -221,6 +242,12 @@ def main() -> None:
221242
help="Password of user's GBIF account. Can also be set in environment.",
222243
dest="gbif_password",
223244
)
245+
parser.add_argument(
246+
'--taxa',
247+
type=str,
248+
required=True,
249+
dest='taxa',
250+
)
224251
parser.add_argument(
225252
"--output_dir",
226253
type=Path,
@@ -239,6 +266,7 @@ def main() -> None:
239266

240267
fetch_gbif_data(
241268
args.collated_data_path,
269+
args.taxa,
242270
args.gbif_username,
243271
args.gbif_email,
244272
args.gbif_password,

0 commit comments

Comments
 (0)