1313def process_species (
1414 aohs_path : Path ,
1515 species_data_path : Path ,
16- species_occurences : pd .DataFrame ,
17- ) -> tuple [int , int , int , float , float , bool ] | None :
18-
19- os .environ ["OGR_GEOJSON_MAX_OBJ_SIZE" ] = "0"
16+ species_occurrences : pd .DataFrame ,
17+ ) -> tuple [int , int , int , float , float , bool ]:
2018
21- if len (species_occurences ) == 0 :
22- return None
19+ if len (species_occurrences ) == 0 :
20+ raise ValueError ( "No occurrences" )
2321
24- taxon_ids = species_occurences .iucn_taxon_id .unique ()
22+ taxon_ids = species_occurrences .iucn_taxon_id .unique ()
2523 if len (taxon_ids ) > 1 :
2624 raise ValueError ("Too many taxon IDs" )
2725 taxon_id = taxon_ids [0 ]
2826
2927 aoh_files = list (aohs_path .glob (f"**/{ taxon_id } *.tif" ))
3028 # We here are aborting on those species with no data or those
3129 # with multiple seasons
32- if len (aoh_files ) != 1 :
33- return None
30+ if len (aoh_files ) == 0 :
31+ raise FileNotFoundError ("No AOHs found" )
32+ if len (aoh_files ) > 1 :
33+ raise NotImplementedError ("Multi-season AOHs not yet supported" )
3434
3535 aoh_tiff_path = aoh_files [0 ]
3636 aoh_data_path = aoh_tiff_path .with_suffix (".json" )
@@ -39,17 +39,17 @@ def process_species(
3939
4040 species_data_files = list (species_data_path .glob (f"**/{ taxon_id } *.geojson" ))
4141 if len (species_data_files ) != 1 :
42- return None
42+ raise RuntimeError ( f"We expected one JSON file beside the GeoTIFF, we found { len ( species_data_files ) } " )
4343 species_range = gpd .read_file (species_data_files [0 ])
4444
4545 # From Dahal et al: "This ensured that we only included points which fell inside
4646 # the boundaries of the selected range maps."
4747 points_gdf = gpd .GeoDataFrame (
48- species_occurences ,
48+ species_occurrences ,
4949 geometry = [
5050 Point (lon , lat )
5151 for lon , lat in
52- zip (species_occurences ['decimalLongitude' ], species_occurences ['decimalLatitude' ])
52+ zip (species_occurrences ['decimalLongitude' ], species_occurrences ['decimalLatitude' ])
5353 ],
5454 crs = 'EPSG:4326' ,
5555 )
@@ -73,7 +73,7 @@ def process_species(
7373 # From Dahal et al: "Finally, we excluded species which had fewer than 10 point localities after
7474 # all the filters were applied."
7575 if len (results ) < 10 :
76- return None
76+ raise ValueError ( "Not enough occurrences" )
7777
7878 matches = len ([x for x in results if x ])
7979 point_prevalence = matches / len (results )
@@ -87,7 +87,20 @@ def process_species(
8787 point_prevalence <= model_prevalence
8888 )
8989
90- def validate_occurences (
def process_species_wrapper(
    aohs_path: Path,
    species_data_path: Path,
    species_occurrences: pd.DataFrame,
) -> tuple[int, int, int, float, float, bool] | None:
    """Invoke process_species, converting its known failure exceptions to None.

    process_species raises a distinct exception per failure mode, which keeps
    its unit tests precise, but Pool.map aborts the entire map as soon as a
    worker raises. Parallel callers should therefore go through this wrapper
    and filter out the None results afterwards.

    NOTE(review): validate_occurrences currently maps process_species itself
    over the per-species groups rather than this wrapper — confirm that is
    intentional, otherwise one failing species aborts the whole run.
    """
    expected_failures = (
        ValueError,
        FileNotFoundError,
        RuntimeError,
        NotImplementedError,
    )
    try:
        result = process_species(aohs_path, species_data_path, species_occurrences)
    except expected_failures:
        return None
    return result
102+
103+ def validate_occurrences (
91104 gbif_data_path : Path ,
92105 aohs_path : Path ,
93106 species_data_path : Path ,
@@ -98,12 +111,12 @@ def validate_occurences(
98111
99112 # The input is from the points.csv generated by fetch_gbif_data.py, which has the columns:
100113 # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year
101- occurences = pd .read_csv (gbif_data_path )
102- occurences .drop (columns = ['gbif_id' , 'year' ], inplace = True )
103- occurences .sort_values (['iucn_taxon_id' , 'decimalLatitude' ], inplace = True )
104- occurences_per_species = [group for _ , group in occurences .groupby ('iucn_taxon_id' )]
114+ occurrences = pd .read_csv (gbif_data_path )
115+ occurrences .drop (columns = ['gbif_id' , 'year' ], inplace = True )
116+ occurrences .sort_values (['iucn_taxon_id' , 'decimalLatitude' ], inplace = True )
117+ occurrences_per_species = [group for _ , group in occurrences .groupby ('iucn_taxon_id' )]
105118 with Pool (processes = process_count ) as pool :
106- results_per_species = pool .map (partial (process_species , aohs_path , species_data_path ), occurences_per_species )
119+ results_per_species = pool .map (partial (process_species , aohs_path , species_data_path ), occurrences_per_species )
107120 cleaned_results = [x for x in results_per_species if x is not None ]
108121
109122 summary = pd .DataFrame (cleaned_results , columns = [
@@ -157,7 +170,7 @@ def main() -> None:
157170 )
158171 args = parser .parse_args ()
159172
160- validate_occurences (
173+ validate_occurrences (
161174 args .gbif_data_path ,
162175 args .aohs_path ,
163176 args .species_data_path ,
0 commit comments