Skip to content

Commit 6dd0c73

Browse files
authored
Merge pull request #27 from quantifyearth/mwd-point-validation
Add point validation using occurrence data
2 parents 4dcfbff + 7ce3ef9 commit 6dd0c73

File tree

8 files changed

+268
-23
lines changed

8 files changed

+268
-23
lines changed

.github/workflows/pull-request.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ jobs:
5757
aoh-endemism --help
5858
aoh-collate-data --help
5959
aoh-validate-prevalence --help
60+
aoh-validate-occurences --help
6061
aoh-fetch-gbif-data --help
6162
6263
- name: Test package imports

aoh/validation/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
11
# AoH Validation
22

3-
This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). This Python implementation cribs heavily from an R implementation by [Franchesca Ridley]().
3+
This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). The model validation implementation cribs heavily from an R implementation by [Franchesca Ridley]().
4+
5+
This directory contains the following scripts:
6+
7+
* `collate_data.py` - When you generate a series of AoH GeoTIFFs, beside each one is a JSON file that contains information required for validation. This script takes a folder containing the AoH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
8+
* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper.
9+
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper.
10+
* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AoHs.

aoh/validation/collate_data.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
import sys
5-
from glob import glob
5+
from pathlib import Path
66

77
import pandas as pd
88

@@ -26,16 +26,14 @@
2626
]
2727

2828
def collate_data(
29-
aoh_results: str,
30-
output_path: str,
29+
aoh_results: Path,
30+
output_path: Path,
3131
) -> None:
32-
manifests = [os.path.join(aoh_results, fn) for fn in glob("**/*.json", root_dir=aoh_results, recursive=True)]
33-
if not manifests:
34-
print(f"Found no manifests in {aoh_results}", file=sys.stderr)
35-
sys.exit(-1)
32+
manifests = aoh_results.glob("**/*.json")
33+
if len(list(manifests)) == 0:
34+
sys.exit(f"Found no manifests in {aoh_results}")
3635

37-
output_dir, _ = os.path.split(output_path)
38-
os.makedirs(output_dir, exist_ok=True)
36+
os.makedirs(output_path.parent, exist_ok=True)
3937

4038
res = []
4139
all_keys = set()
@@ -61,14 +59,14 @@ def main() -> None:
6159
parser = argparse.ArgumentParser(description="Collate metadata from AoH build.")
6260
parser.add_argument(
6361
'--aoh_results',
64-
type=str,
62+
type=Path,
6563
help="Path of all the AoH outputs.",
6664
required=True,
6765
dest="aohs_path"
6866
)
6967
parser.add_argument(
7068
"--output",
71-
type=str,
69+
type=Path,
7270
required=True,
7371
dest="output_path",
7472
help="Destination for collated CSV."

aoh/validation/validate_map_prevalence.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Based on R code authored by Franchesca Ridley.
33

44
import argparse
5+
from pathlib import Path
56

67
import pandas as pd
78

@@ -64,8 +65,8 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
6465
return pd.concat(per_class_df) # type: ignore[no-any-return]
6566

6667
def validate_map_prevalence(
67-
collated_data_path: str,
68-
output_path: str,
68+
collated_data_path: Path,
69+
output_path: Path,
6970
) -> None:
7071
aoh_df = pd.read_csv(collated_data_path)
7172
outliers = model_validation(aoh_df)
@@ -75,14 +76,14 @@ def main() -> None:
7576
parser = argparse.ArgumentParser(description="Validate map prevalence.")
7677
parser.add_argument(
7778
'--collated_aoh_data',
78-
type=str,
79+
type=Path,
7980
help="CSV containing collated AoH data",
8081
required=True,
8182
dest="collated_data_path"
8283
)
8384
parser.add_argument(
8485
"--output",
85-
type=str,
86+
type=Path,
8687
required=True,
8788
dest="output_path",
8889
help="CSV of outliers."
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import argparse
2+
import os
3+
from contextlib import ExitStack
4+
from functools import partial
5+
from multiprocessing import cpu_count, Pool
6+
from pathlib import Path
7+
8+
import pandas as pd
9+
import yirgacheffe as yg
10+
11+
def process_species(
    aohs_path: Path,
    species_occurences: pd.DataFrame,
) -> pd.DataFrame:
    """Check one species' occurrence records against its AoH rasters.

    Adds an `occurence` column to the frame: truthy where the summed AoH
    value at the record's lat/lng is positive, False for every row when no
    AoH raster exists for the taxon. All rows must share a single taxon ID.
    """
    # No records: still add the column so the schema is uniform downstream.
    if species_occurences.empty:
        species_occurences['occurence'] = None
        return species_occurences

    taxon_ids = species_occurences.iucn_taxon_id.unique()
    if len(taxon_ids) > 1:
        raise ValueError("Too many taxon IDs")
    taxon_id = taxon_ids[0]

    # An AoH run may emit several rasters per taxon (e.g. seasonal maps),
    # all named with the taxon ID prefix.
    aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif"))
    if not aoh_files:
        species_occurences['occurence'] = False
        return species_occurences

    presence = []
    with ExitStack() as stack:
        layers = [stack.enter_context(yg.read_raster(path)) for path in aoh_files]
        # Sum the layers so a point counts as present if any raster covers it.
        combined = layers[0]
        for layer in layers[1:]:
            combined += layer

        for _, record in species_occurences.iterrows():
            pixel_x, pixel_y = combined.pixel_for_latlng(record.decimalLatitude, record.decimalLongitude)
            sample = combined.read_array(pixel_x, pixel_y, 1, 1)
            presence.append(sample > 0.0)

    species_occurences['occurence'] = presence
    return species_occurences
44+
45+
def validate_occurences(
    gbif_data_path: Path,
    aohs_path: Path,
    output_path: Path,
    process_count: int,
) -> None:
    """Score every downloaded GBIF occurrence point against the AoH corpus.

    Reads the CSV produced by fetch_gbif_data.py, fans the per-species
    groups out over a process pool, and writes the combined frame (with
    the added `occurence` column) to `output_path`.
    """
    os.makedirs(output_path.parent, exist_ok=True)

    # Input is the points.csv generated by fetch_gbif_data.py, with columns:
    # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year
    # NOTE(review): the dropped names ('gbif_data', 'year') do not match the
    # columns listed above ('gbif_id', 'assessment year') — confirm against
    # the actual output of fetch_gbif_data.py.
    points = pd.read_csv(gbif_data_path)
    points.drop(columns=['gbif_data', 'year'], inplace=True)
    # Group rows by taxon so each worker opens a taxon's rasters only once.
    points.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
    per_species = [group for _, group in points.groupby('iucn_taxon_id')]

    with Pool(processes=process_count) as pool:
        checked_per_species = pool.map(partial(process_species, aohs_path), per_species)
    pd.concat(checked_per_species).to_csv(output_path)
63+
64+
def main() -> None:
    """Command-line entry point: parse arguments and run occurrence validation.

    Bug fix: the "-j" option used dest="processes_count" while the call below
    read args.process_count, which raised AttributeError whenever the script
    ran; the dest now matches. The parser description and help strings were
    also copy-pasted from validate_map_prevalence.py and have been corrected
    to describe this script.
    """
    parser = argparse.ArgumentParser(description="Validate GBIF occurrence points against AoH maps.")
    parser.add_argument(
        '--gbif_data_path',
        type=Path,
        help="Data containing downloaded GBIF data.",
        required=True,
        dest="gbif_data_path"
    )
    parser.add_argument(
        '--aoh_results',
        type=Path,
        help="Path of all the AoH outputs.",
        required=True,
        dest="aohs_path"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        dest="output_path",
        help="Destination CSV for occurrence validation results."
    )
    parser.add_argument(
        "-j",
        type=int,
        required=False,
        # Default to half the cores, as the work is I/O heavy.
        default=round(cpu_count() / 2),
        dest="process_count",
        help="Optional number of concurrent processes to use."
    )
    args = parser.parse_args()

    validate_occurences(
        args.gbif_data_path,
        args.aohs_path,
        args.output_path,
        args.process_count,
    )
103+
104+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "aoh"
7-
version = "1.0.1"
7+
version = "1.1.0"
88
description = "A library for calculating Area of Habitat for species distribution mapping"
99
authors = [
1010
{name = "Michael Dales", email = "mwd24@cam.ac.uk"}
@@ -29,7 +29,7 @@ dependencies = [
2929
"psutil",
3030
"pyproj>=3.4,<4.0",
3131
"scikit-image>=0.20,<1.0",
32-
"yirgacheffe>=1.7.8,<2.0",
32+
"yirgacheffe>=1.9.1,<2.0",
3333
"zenodo_search",
3434
"pandas>=2.0,<3.0",
3535
"gdal[numpy]>=3.8,<3.12",
@@ -47,6 +47,7 @@ dev = [
4747
"pytest",
4848
"types-psutil",
4949
"types-requests",
50+
"types-shapely",
5051
"pandas-stubs",
5152
"geojson",
5253
"pytest-cov",
@@ -66,6 +67,7 @@ aoh-species-richness = "aoh.summaries.species_richness:main"
6667
aoh-endemism = "aoh.summaries.endemism:main"
6768
aoh-collate-data = "aoh.validation.collate_data:main"
6869
aoh-validate-prevalence = "aoh.validation.validate_map_prevalence:main"
70+
aoh-validate-occurences = "aoh.validation.validate_occurences:main"
6971
aoh-fetch-gbif-data = "aoh.validation.fetch_gbif_data:main"
7072

7173
[tool.setuptools]

tests/test_aohcalc.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,11 @@ def generate_species_info(
103103
"full_habitat_code": "|".join(sorted(list(habitat_codes))),
104104
}
105105
coordinates = [[
106-
[-90, -54],
107-
[90, -54],
108-
[90, 54],
109-
[-90, 54],
110-
[-90, -54],
106+
[-90, -45],
107+
[90, -45],
108+
[90, 45],
109+
[-90, 45],
110+
[-90, -45],
111111
]]
112112
polygon = geojson.Polygon(coordinates)
113113
feature= geojson.Feature(geometry=polygon, properties=properties)

tests/test_occurences.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import json
2+
import tempfile
3+
from pathlib import Path
4+
5+
import pandas as pd
6+
import pytest
7+
import yirgacheffe as yg
8+
from shapely.geometry import mapping, Polygon
9+
10+
from aoh.validation.validate_occurences import process_species
11+
12+
def test_empty_species_list() -> None:
    """An empty occurrence frame comes straight back with zero rows."""
    empty = pd.DataFrame([], columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'])
    result = process_species(Path("/some/aohs"), empty)
    assert result.empty
16+
17+
def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None:
    """Rasterise a polygon (default: a 10x10 square at the origin) to a GeoTIFF.

    Writes the polygon via a temporary GeoJSON file so yirgacheffe can burn
    it into a raster at `filename`.
    """
    geometry = shape if shape is not None else Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])
    collection = {
        "type": "FeatureCollection",
        "features": [{
            "type": "Feature",
            "properties": {},
            "geometry": mapping(geometry),
        }],
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        vector_path = Path(tmpdir) / "tmp.geojson"
        with open(vector_path, 'w', encoding="UTF-8") as handle:
            json.dump(collection, handle, indent=2)

        # Rasterise at 1 degree per pixel in WGS84.
        with yg.read_shape(vector_path, ("epsg:4326", (1.0, -1.0))) as layer:
            layer.to_geotiff(filename)
45+
46+
@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
    (42, 5.0, 5.0, True),
    (42, 12.0, 12.0, False),
    (40, 5.0, 5.0, False),
])
def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
    """A point matches only when inside the correct taxon's AoH raster."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        # Rasters for several taxa, so the lookup must pick the right file.
        for neighbour_id in [41, 42, 43]:
            generate_faux_aoh(aoh_dir / f"{neighbour_id}.tif")

        occurrences = pd.DataFrame(
            [(taxon_id, latitude, longitude)],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
        )

        checked = process_species(aoh_dir, occurrences)

        assert len(checked) == len(occurrences)
        assert checked.occurence[0] == expected
69+
70+
def test_multiple_match() -> None:
    """Per-row results line up with per-row expectations for one species."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        for neighbour_id in [41, 42, 43]:
            generate_faux_aoh(aoh_dir / f"{neighbour_id}.tif")

        # One point inside the default 10x10 square, one outside.
        occurrences = pd.DataFrame(
            [
                (42, 5.0, 5.0, True),
                (42, 12.0, 12.0, False),
            ],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected']
        )

        checked = process_species(aoh_dir, occurrences)

        assert len(checked) == len(occurrences)
        assert (checked.occurence == checked.expected).all()
90+
91+
def test_too_many_ids() -> None:
    """Mixing taxon IDs in one group is a programming error and raises."""
    occurrences = pd.DataFrame(
        [
            (42, 5.0, 5.0, True),
            (42, 12.0, 12.0, False),
            (40, 5.0, 5.0, False),
        ],
        columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected']
    )
    with pytest.raises(ValueError):
        _ = process_species(Path("/some/aohs"), occurrences)
103+
104+
@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
    (42, 5.0, 5.0, True),
    (42, -5.0, -5.0, True),
    (42, 5.0, -5.0, False),
    (42, -5.0, 5.0, False),
    (40, 5.0, 5.0, False),
])
def test_find_seasonal(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
    """A point is matched if it falls in either of a taxon's seasonal rasters."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        # Two disjoint seasonal AoHs sharing the same taxon ID prefix.
        seasonal_footprints = [
            ('breeding', Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])),
            ('nonbreeding', Polygon([(0, 0), (0, -10), (-10, -10), (-10, 0)])),
        ]
        for season, footprint in seasonal_footprints:
            generate_faux_aoh(aoh_dir / f"42_{season}.tif", footprint)

        occurrences = pd.DataFrame(
            [(taxon_id, latitude, longitude)],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
        )

        checked = process_species(aoh_dir, occurrences)

        assert len(checked) == len(occurrences)
        assert checked.occurence[0] == expected

0 commit comments

Comments
 (0)