Merged
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@

* Performance improvements and simplification to habitat processing.
* Store more analysis data from model validation.
* Improve performance of GBIF occurence data fetches.
* Improve performance of GBIF occurrence data fetches.

### Fixed

59 changes: 59 additions & 0 deletions README.md
@@ -219,3 +219,62 @@ options:
This will produce a CSV file listing just the AoH maps that fail model validation.

**Note:** The validation tools require R to be installed on your system with the `lme4` and `lmerTest` packages.

## aoh-fetch-gbif-data

This command fetches occurrence data from [GBIF](https://gbif.org) for use in occurrence checking, as per Dahal et al.

```bash
$ aoh-fetch-gbif-data --help
usage: aoh-fetch-gbif-data [-h] --collated_aoh_data COLLATED_DATA_PATH [--gbif_username GBIF_USERNAME] [--gbif_email GBIF_EMAIL] [--gbif_password GBIF_PASSWORD] --taxa TAXA --output_dir OUTPUT_DIR_PATH

Fetch GBIF records for species for validation.

options:
-h, --help show this help message and exit
--collated_aoh_data COLLATED_DATA_PATH
CSV containing collated AoH data
--gbif_username GBIF_USERNAME
Username of user's GBIF account. Can also be set in environment.
--gbif_email GBIF_EMAIL
E-mail of user's GBIF account. Can also be set in environment.
--gbif_password GBIF_PASSWORD
Password of user's GBIF account. Can also be set in environment.
--taxa TAXA
--output_dir OUTPUT_DIR_PATH
Destination directory for GBIF data.

Environment Variables:
GBIF_USERNAME Username of user's GBIF account.
GBIF_EMAIL E-mail of user's GBIF account.
GBIF_PASSWORD Password of user's GBIF account.
```

Important notes:

1. You will need a GBIF account for this.
2. This can take a long time, particularly for birds, as there are so many records.
3. It can also generate a lot of data, hundreds of gigabytes' worth, so ensure you have enough storage space!

## aoh-validate-occurrences

This command will run occurrence validation using the GBIF data fetched with the previous command.

```bash
$ aoh-validate-occurences --help
usage: aoh-validate-occurences [-h] --gbif_data_path GBIF_DATA_PATH --species_data SPECIES_DATA_PATH --aoh_results AOHS_PATH --output OUTPUT_PATH [-j PROCESSES_COUNT]

Validate occurrence prevalence.

options:
-h, --help show this help message and exit
--gbif_data_path GBIF_DATA_PATH
Data containing downloaded GBIF data.
--species_data SPECIES_DATA_PATH
Path of all the species range data.
--aoh_results AOHS_PATH
Path of all the AoH outputs.
--output OUTPUT_PATH CSV of outliers.
-j PROCESSES_COUNT   Optional number of concurrent processes to use.
```
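As a rough illustration of what point validation measures, each occurrence record is sampled against the species' AoH raster, and the fraction of points that land on suitable cells is reported. The sketch below is a simplified stand-in, not this repository's implementation; `point_prevalence` and the toy array are invented for illustration:

```python
import numpy as np

def point_prevalence(aoh: np.ndarray, rows: list[int], cols: list[int]) -> float:
    """Fraction of occurrence points that fall on suitable (non-zero) AoH cells.

    `aoh` is a 2D array where non-zero means suitable habitat; `rows`/`cols`
    are the raster indices of the occurrence points. Hypothetical helper,
    for illustration only.
    """
    hits = sum(1 for r, c in zip(rows, cols) if aoh[r, c] > 0)
    return hits / len(rows)

# Toy example: a 3x3 AoH where only the left column is suitable habitat.
aoh = np.array([
    [1, 0, 0],
    [1, 0, 0],
    [1, 0, 0],
])
print(point_prevalence(aoh, rows=[0, 1, 2, 0], cols=[0, 0, 0, 2]))  # 0.75
```

In the real pipeline the occurrence coordinates first have to be projected into the raster's pixel space; that step is omitted here.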

2 changes: 1 addition & 1 deletion aoh/validation/README.md
@@ -6,5 +6,5 @@ This directory contains the following scripts:

* `collate_data.py` - When you generate a series of AOH GeoTIFFs, beside each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper.
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurence data on GBIF that can be used for point validation as per the Dahal et al paper.
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper.
* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AOHs.
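For orientation, the collation step performed by `collate_data.py` can be sketched as below. This is a minimal illustration, not the script itself; the JSON keys (`taxon_id`, `prevalence`) are placeholders and will not match the project's real schema:

```python
import csv
import json
import tempfile
from pathlib import Path

def collate_json_sidecars(aoh_dir: Path, output_csv: Path) -> int:
    """Merge the per-AoH JSON sidecar files in a run directory into one CSV.

    Returns the number of rows written. Keys are placeholders for illustration.
    """
    rows = [json.loads(p.read_text(encoding="utf-8")) for p in sorted(aoh_dir.glob("*.json"))]
    if not rows:
        return 0
    fieldnames = sorted({key for row in rows for key in row})
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    return len(rows)

# Tiny demonstration with two fake sidecar files.
with tempfile.TemporaryDirectory() as tmp:
    run_dir = Path(tmp)
    (run_dir / "41_RESIDENT.json").write_text(json.dumps({"taxon_id": 41, "prevalence": 0.5}))
    (run_dir / "42_RESIDENT.json").write_text(json.dumps({"taxon_id": 42, "prevalence": 0.7}))
    written = collate_json_sidecars(run_dir, run_dir / "collated.csv")
    header = (run_dir / "collated.csv").read_text().splitlines()[0]

print(written, header)  # 2 prevalence,taxon_id
```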
39 changes: 38 additions & 1 deletion aoh/validation/fetch_gbif_data.py
@@ -137,19 +137,56 @@ def build_point_validation_table(
output_csv_path: Path,
chunksize: int = 100_000,
) -> None:
# The challenge here is that the GBIF download format is problematic. For instance, the file extension
# is CSV, but the data within is clearly tab separated: https://www.gbif.org/faq?q=csv
# Parsing these files with pandas we get warnings of mixed data types in columns, which seems to stem
# from it failing to deal with some escaping issues. Thus the file read here tries to be somewhat
# defensive, but is not perfect.
first_chunk = True
for chunk in pd.read_csv(gbif_data_path, sep='\t', chunksize=chunksize, on_bad_lines='skip'):
total_rows = 0
skipped_rows = 0
for chunk in pd.read_csv(
gbif_data_path,
sep='\t', # Is a TSV file despite the CSV file extension
chunksize=chunksize,
on_bad_lines='skip', # Skip rows with wrong number of columns
low_memory=False, # Avoid dtype warnings
encoding='utf-8', # GBIF uses UTF-8
quotechar='"', # Standard quote character
escapechar='\\', # Handle escaped characters
):
valid_mask = (
pd.to_numeric(chunk['speciesKey'], errors='coerce').notna() &
pd.to_numeric(chunk['decimalLatitude'], errors='coerce').notna() &
pd.to_numeric(chunk['decimalLongitude'], errors='coerce').notna() &
pd.to_numeric(chunk['year'], errors='coerce').notna()
)

skipped_rows += (~valid_mask).sum()
chunk = chunk[valid_mask].copy()  # Copy so the assignments below don't modify a view

# Force correct types once the invalid data has been removed
chunk['speciesKey'] = pd.to_numeric(chunk['speciesKey'], errors='coerce')
chunk['decimalLatitude'] = pd.to_numeric(chunk['decimalLatitude'], errors='coerce')
chunk['decimalLongitude'] = pd.to_numeric(chunk['decimalLongitude'], errors='coerce')
chunk['year'] = pd.to_numeric(chunk['year'], errors='coerce')

chunk.rename(columns={"speciesKey": "gbif_id"}, inplace=True)
updated_data = chunk.merge(map_df, on="gbif_id", how='inner')
necessary_columns = updated_data[["iucn_taxon_id", "gbif_id", "decimalLatitude", "decimalLongitude", "year"]]

necessary_columns.to_csv(
output_csv_path,
mode='w' if first_chunk else 'a',
header=first_chunk,
index=False
)
total_rows += len(necessary_columns)
first_chunk = False

print(f"Wrote {total_rows} rows to {output_csv_path}")
print(f"Skipped {skipped_rows} rows due to invalid/missing data")
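The defensive pattern above (read the tab-separated file in chunks, coerce the numeric columns, and drop rows that fail to parse) can be exercised on a small in-memory sample. This is a simplified sketch with made-up data, not the module's actual reader:

```python
import io

import pandas as pd

# Two good rows and one row where decimalLatitude is not numeric.
raw = (
    "speciesKey\tdecimalLatitude\tdecimalLongitude\tyear\n"
    "100\t1.5\t2.5\t2001\n"
    "101\tnot-a-number\t3.5\t2002\n"
    "102\t4.5\t5.5\t2003\n"
)

kept = 0
skipped = 0
for chunk in pd.read_csv(io.StringIO(raw), sep="\t", chunksize=2, on_bad_lines="skip"):
    # A row is valid only if every required column parses as a number.
    valid = (
        pd.to_numeric(chunk["speciesKey"], errors="coerce").notna()
        & pd.to_numeric(chunk["decimalLatitude"], errors="coerce").notna()
        & pd.to_numeric(chunk["decimalLongitude"], errors="coerce").notna()
        & pd.to_numeric(chunk["year"], errors="coerce").notna()
    )
    skipped += int((~valid).sum())
    kept += int(valid.sum())

print(kept, skipped)  # 2 1
```

`on_bad_lines="skip"` only handles rows with the wrong number of fields; the `pd.to_numeric(..., errors="coerce")` mask catches rows that are structurally fine but carry unparseable values.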

def fetch_gbif_data(
collated_data_path: Path,
taxa: str,
Expand Down
16 changes: 11 additions & 5 deletions aoh/validation/validate_occurences.py
@@ -36,7 +36,7 @@ def process_species(
raise ValueError("Too many taxon IDs")
taxon_id = taxon_ids[0]

aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif"))
aoh_files = list(aohs_path.glob(f"**/{taxon_id}_*.tif"))
# We here are aborting on those species with no data or those
# with multiple seasons
if len(aoh_files) == 0:
@@ -49,9 +49,11 @@
with open(aoh_data_path, 'r', encoding='utf-8') as f:
aoh_data = json.load(f)

species_data_files = list(species_data_path.glob(f"**/{taxon_id}*.geojson"))
species_data_files = list(species_data_path.glob(f"**/{taxon_id}_*.geojson"))
if len(species_data_files) != 1:
raise RuntimeError(f"We expected one JSON file beside the GeoTIFF, we found {len(species_data_files)}")
raise RuntimeError(
f"We expected one GeoJSON file beside the GeoTIFF, we found {len(species_data_files)} for {taxon_id}"
)
species_range = gpd.read_file(species_data_files[0])

# From Dahal et al: "This ensured that we only included points which fell inside
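The glob change above, from `{taxon_id}*.tif` to `{taxon_id}_*.tif`, matters because a bare prefix glob lets taxon IDs collide with longer IDs that share the same leading digits. A quick demonstration (file names invented for illustration):

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmpdir:
    root = Path(tmpdir)
    for name in ["41_RESIDENT.tif", "410_RESIDENT.tif", "4100_RESIDENT.tif"]:
        (root / name).touch()

    # Without the underscore, taxon 41 also matches taxa 410 and 4100.
    loose = sorted(p.name for p in root.glob("41*.tif"))
    strict = sorted(p.name for p in root.glob("41_*.tif"))

print(loose)   # all three files match the loose pattern
print(strict)  # only 41_RESIDENT.tif
```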
@@ -148,14 +150,18 @@ def validate_occurrences(
occurrences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
occurrences_per_species = [group for _, group in occurrences.groupby('iucn_taxon_id')]
with Pool(processes=process_count) as pool:
results_per_species = pool.map(partial(process_species, aohs_path, species_data_path), occurrences_per_species)
results_per_species = pool.map(partial(
process_species_wrapper,
aohs_path,
species_data_path
), occurrences_per_species)
cleaned_results = [x for x in results_per_species if x is not None]

summary = pd.DataFrame(cleaned_results)
summary.to_csv(output_path, index=False)
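The per-species fan-out above (group the occurrences by taxon, then map a partially-applied worker over the groups) can be sketched serially. Here `summarise` is an invented stand-in for the real `process_species`, and plain `map` stands in for `Pool.map`:

```python
from functools import partial

import pandas as pd

def summarise(label: str, group: pd.DataFrame) -> dict:
    """Toy worker: produce one summary row per species group."""
    return {
        "label": label,
        "iucn_taxon_id": int(group["iucn_taxon_id"].iloc[0]),
        "n_points": len(group),
    }

occurrences = pd.DataFrame({
    "iucn_taxon_id": [41, 41, 42],
    "decimalLatitude": [0.0, 1.0, 2.0],
})
groups = [group for _, group in occurrences.groupby("iucn_taxon_id")]

# The real code hands the partial to multiprocessing.Pool.map; plain map shows the shape.
results = list(map(partial(summarise, "demo"), groups))
summary = pd.DataFrame(results)
print(summary)
```

`partial` binds the arguments that are the same for every species (in the real code, the AoH and range-data paths), so the pool only has to ship each per-species DataFrame to a worker.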

def main() -> None:
parser = argparse.ArgumentParser(description="Validate map prevalence.")
parser = argparse.ArgumentParser(description="Validate occurrence prevalence.")
parser.add_argument(
'--gbif_data_path',
type=Path,
8 changes: 4 additions & 4 deletions tests/test_occurences.py
@@ -98,7 +98,7 @@ def test_simple_match_in_out_range(
tmpdir_path = Path(tmpdir)

for test_id in [41, 42, 43]:
aoh_path = tmpdir_path / f"{test_id}.tif"
aoh_path = tmpdir_path / f"{test_id}_RESIDENT.tif"
generate_faux_aoh(aoh_path)

occurences = generate_occurrence_cluster(latitude, longitude, 20, 2.0)
@@ -116,7 +116,7 @@
(42, 0.0, 0.0, 1.0, True, False), # all in AoH
(42, 0.0, 20.0, None, False, None), # all out of range
])
def test_model_prevalence_of_one(
def test_occurrence_prevalence_of_one(
taxon_id: int,
latitude: float,
longitude: float,
@@ -128,7 +128,7 @@ def test_model_prevalence_of_one(
tmpdir_path = Path(tmpdir)

for test_id in [41, 42, 43]:
aoh_path = tmpdir_path / f"{test_id}.tif"
aoh_path = tmpdir_path / f"{test_id}_RESIDENT.tif"
generate_faux_aoh(aoh_path, aoh_radius=5.0, range_radius=5.0)

occurences = generate_occurrence_cluster(latitude, longitude, 20, 2.0)
@@ -150,7 +150,7 @@ def test_no_aoh_found() -> None:
tmpdir_path = Path(tmpdir)

for test_id in [41, 42, 43]:
aoh_path = tmpdir_path / f"{test_id}.tif"
aoh_path = tmpdir_path / f"{test_id}_RESIDENT.tif"
generate_faux_aoh(aoh_path)

df = pd.DataFrame(