From 58039fe603635dff87d55941959fc0be94070630 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Thu, 25 Sep 2025 14:38:03 +0100 Subject: [PATCH 1/8] Add bits to readme for validation --- aoh/validation/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/aoh/validation/README.md b/aoh/validation/README.md index eb4860b..9922092 100644 --- a/aoh/validation/README.md +++ b/aoh/validation/README.md @@ -1,3 +1,9 @@ # AoH Validation -This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). This Python implementation cribs heavily from an R implementation by [Franchesca Ridley](). \ No newline at end of file +This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). The model validation implementation cribs heavily from an R implementation by [Franchesca Ridley](). + +This directory contains the following scripts: + +* `collate_data.py` - Then you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run. +* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper. +* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurence data on GBIF that can be used for point validation as per the Dahal et al paper. 
From 092d5e7bffdc8f94ae430d35e3415aa67de904a8 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 26 Sep 2025 07:35:18 +0100 Subject: [PATCH 2/8] Sprinkle in more modern python --- aoh/validation/collate_data.py | 20 +++++++++----------- aoh/validation/validate_map_prevalence.py | 9 +++++---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/aoh/validation/collate_data.py b/aoh/validation/collate_data.py index e77a2b0..952c09a 100644 --- a/aoh/validation/collate_data.py +++ b/aoh/validation/collate_data.py @@ -2,7 +2,7 @@ import json import os import sys -from glob import glob +from pathlib import Path import pandas as pd @@ -26,16 +26,14 @@ ] def collate_data( - aoh_results: str, - output_path: str, + aoh_results: Path, + output_path: Path, ) -> None: - manifests = [os.path.join(aoh_results, fn) for fn in glob("**/*.json", root_dir=aoh_results, recursive=True)] - if not manifests: - print(f"Found no manifests in {aoh_results}", file=sys.stderr) - sys.exit(-1) + manifests = aoh_results.glob("**/*.json") + if len(list(manifests)): + sys.exit(f"Found no manifests in {aoh_results}") - output_dir, _ = os.path.split(output_path) - os.makedirs(output_dir, exist_ok=True) + os.makedirs(output_path.parent, exist_ok=True) res = [] all_keys = set() @@ -61,14 +59,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Collate metadata from AoH build.") parser.add_argument( '--aoh_results', - type=str, + type=Path, help="Path of all the AoH outputs.", required=True, dest="aohs_path" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_path", help="Destination for collated CSV." diff --git a/aoh/validation/validate_map_prevalence.py b/aoh/validation/validate_map_prevalence.py index 8f799f5..8e761a1 100644 --- a/aoh/validation/validate_map_prevalence.py +++ b/aoh/validation/validate_map_prevalence.py @@ -2,6 +2,7 @@ # Based on R code authored by Franchesca Ridley. 
import argparse +from pathlib import Path import pandas as pd @@ -64,8 +65,8 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame: return pd.concat(per_class_df) # type: ignore[no-any-return] def validate_map_prevalence( - collated_data_path: str, - output_path: str, + collated_data_path: Path, + output_path: Path, ) -> None: aoh_df = pd.read_csv(collated_data_path) outliers = model_validation(aoh_df) @@ -75,14 +76,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Validate map prevalence.") parser.add_argument( '--collated_aoh_data', - type=str, + type=Path, help="CSV containing collated AoH data", required=True, dest="collated_data_path" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_path", help="CSV of outliers." From 2e5a07eedef6177dd24f63604b92873019b79153 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Fri, 26 Sep 2025 07:35:35 +0100 Subject: [PATCH 3/8] Start of point validation --- aoh/validation/validate_occurences.py | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 aoh/validation/validate_occurences.py diff --git a/aoh/validation/validate_occurences.py b/aoh/validation/validate_occurences.py new file mode 100644 index 0000000..3a50a51 --- /dev/null +++ b/aoh/validation/validate_occurences.py @@ -0,0 +1,44 @@ +import argparse +import os +from pathlib import Path + +def validate_occurences( + gbif_data_path: Path, + aohs_path: Path, + output_path: Path, +) -> None: + os.makedirs(output_path.parent, exist_ok=True) + +def main() -> None: + parser = argparse.ArgumentParser(description="Validate map prevalence.") + parser.add_argument( + '--gbif_data_path', + type=Path, + help="Data containing downloaded GBIF data.", + required=True, + dest="gbif_data_path" + ) + parser.add_argument( + '--aoh_results', + type=Path, + help="Path of all the AoH outputs.", + required=True, + dest="aohs_path" + ) + parser.add_argument( + "--output", + type=Path, + 
required=True, + dest="output_path", + help="CSV of outliers." + ) + args = parser.parse_args() + + validate_occurences( + args.gbif_data_path, + args.aohs_path, + args.output_path, + ) + +if __name__ == "__main__": + main() From b71a7e0a33946cab479c560d21dbb8a189fa11d9 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Tue, 30 Sep 2025 16:08:23 +0100 Subject: [PATCH 4/8] More WIP --- aoh/validation/validate_occurences.py | 38 +++++++++++++++++++++++++++ tests/test_aohcalc.py | 10 +++---- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/aoh/validation/validate_occurences.py b/aoh/validation/validate_occurences.py index 3a50a51..f6b3537 100644 --- a/aoh/validation/validate_occurences.py +++ b/aoh/validation/validate_occurences.py @@ -1,14 +1,43 @@ import argparse import os +from functools import partial +from multiprocessing import cpu_count, Pool from pathlib import Path +import pandas as pd +import yirgacheffe as yg + +def validate_occurence( + gbif_datum: tuple[int, float, float], + aohs_path: Path, +) -> tuple[int, float, float, bool]: + taxon_id, lat, lng = gbif_datum + +def process_species( + aohs_path: Path, + species_occurences: pd.DataFrame, +) -> pd.DataFrame: + pass + def validate_occurences( gbif_data_path: Path, aohs_path: Path, output_path: Path, + process_count: int, ) -> None: os.makedirs(output_path.parent, exist_ok=True) + # The input is from the points.csv generated by fetch_gbif_data.py, which has the columns: + # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year + occurences = pd.read_csv(gbif_data_path) + occurences.drop(columns=['gbif_data', 'year'], inplace=True) + occurences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True) + occurences_per_species = [group for _, group in df.groupby('iucn_taxon_id')] + with Pool(processes=process_count) as pool: + results_per_species = pool.map(partial(process_species, aohs_path), occurences_per_species) + results = pd.concat(results_per_species) + 
results.to_csv(output_path) + def main() -> None: parser = argparse.ArgumentParser(description="Validate map prevalence.") parser.add_argument( @@ -32,12 +61,21 @@ def main() -> None: dest="output_path", help="CSV of outliers." ) + parser.add_argument( + "-j", + type=int, + required=False, + default=round(cpu_count() / 2), + dest="process_count", + help="Optional number of concurrent threads to use." + ) args = parser.parse_args() validate_occurences( args.gbif_data_path, args.aohs_path, args.output_path, + args.process_count, ) if __name__ == "__main__": diff --git a/tests/test_aohcalc.py b/tests/test_aohcalc.py index 6b387ca..cbf2d22 100644 --- a/tests/test_aohcalc.py +++ b/tests/test_aohcalc.py @@ -103,11 +103,11 @@ def generate_species_info( "full_habitat_code": "|".join(sorted(list(habitat_codes))), } coordinates = [[ - [-90, -54], - [90, -54], - [90, 54], - [-90, 54], - [-90, -54], + [-90, -45], + [90, -45], + [90, 45], + [-90, 45], + [-90, -45], ]] polygon = geojson.Polygon(coordinates) feature= geojson.Feature(geometry=polygon, properties=properties) From 25a4a67aa6a6665ccadd29f78be3173d8c463f5c Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Wed, 1 Oct 2025 08:38:48 +0100 Subject: [PATCH 5/8] WIP: untested occurence --- aoh/validation/validate_occurences.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/aoh/validation/validate_occurences.py b/aoh/validation/validate_occurences.py index f6b3537..0ab8c1e 100644 --- a/aoh/validation/validate_occurences.py +++ b/aoh/validation/validate_occurences.py @@ -1,5 +1,6 @@ import argparse import os +from contextlib import ExitStack from functools import partial from multiprocessing import cpu_count, Pool from pathlib import Path @@ -13,6 +14,22 @@ def validate_occurence( ) -> tuple[int, float, float, bool]: taxon_id, lat, lng = gbif_datum + aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif")) + if len(aoh_files) == 0: + return (taxon_id, lat, lng, False) + + with 
ExitStack() as stack: + rasters = [stack.enter_context(yg.read_raster(x)) for x in aoh_files] + aoh = rasters[0] + for raster in rasters [1:]: + aoh += raster + + pixel_x, pixel_y = raster.pixel_for_latlng(lat, lng) + value = aoh.read_array(pixel_x, pixel_y, 1, 1) + + return (taxon_id, lat, lng, value > 0.0) + + def process_species( aohs_path: Path, species_occurences: pd.DataFrame, @@ -32,7 +49,7 @@ def validate_occurences( occurences = pd.read_csv(gbif_data_path) occurences.drop(columns=['gbif_data', 'year'], inplace=True) occurences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True) - occurences_per_species = [group for _, group in df.groupby('iucn_taxon_id')] + occurences_per_species = [group for _, group in occurences.groupby('iucn_taxon_id')] with Pool(processes=process_count) as pool: results_per_species = pool.map(partial(process_species, aohs_path), occurences_per_species) results = pd.concat(results_per_species) From 57ba6372d0f903ba69415112d2d5dd687f4df0b3 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Wed, 1 Oct 2025 10:37:28 +0100 Subject: [PATCH 6/8] simple version of occurrence validation. 
--- aoh/validation/validate_occurences.py | 36 ++++--- tests/test_occurences.py | 129 ++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 15 deletions(-) create mode 100644 tests/test_occurences.py diff --git a/aoh/validation/validate_occurences.py b/aoh/validation/validate_occurences.py index 0ab8c1e..3b3c7fe 100644 --- a/aoh/validation/validate_occurences.py +++ b/aoh/validation/validate_occurences.py @@ -8,15 +8,24 @@ import pandas as pd import yirgacheffe as yg -def validate_occurence( - gbif_datum: tuple[int, float, float], +def process_species( aohs_path: Path, -) -> tuple[int, float, float, bool]: - taxon_id, lat, lng = gbif_datum + species_occurences: pd.DataFrame, +) -> pd.DataFrame: + + if len(species_occurences) == 0: + species_occurences['occurence'] = None + return species_occurences + + taxon_ids = species_occurences.iucn_taxon_id.unique() + if len(taxon_ids) > 1: + raise ValueError("Too many taxon IDs") + taxon_id = taxon_ids[0] aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif")) if len(aoh_files) == 0: - return (taxon_id, lat, lng, False) + species_occurences['occurence'] = False + return species_occurences with ExitStack() as stack: rasters = [stack.enter_context(yg.read_raster(x)) for x in aoh_files] @@ -24,17 +33,14 @@ def validate_occurence( for raster in rasters [1:]: aoh += raster - pixel_x, pixel_y = raster.pixel_for_latlng(lat, lng) - value = aoh.read_array(pixel_x, pixel_y, 1, 1) - - return (taxon_id, lat, lng, value > 0.0) + results = [] + for _, row in species_occurences.iterrows(): + pixel_x, pixel_y = aoh.pixel_for_latlng(row.decimalLatitude, row.decimalLongitude) + value = aoh.read_array(pixel_x, pixel_y, 1, 1) + results.append(value > 0.0) - -def process_species( - aohs_path: Path, - species_occurences: pd.DataFrame, -) -> pd.DataFrame: - pass + species_occurences['occurence'] = results + return species_occurences def validate_occurences( gbif_data_path: Path, diff --git a/tests/test_occurences.py 
b/tests/test_occurences.py new file mode 100644 index 0000000..c6a2415 --- /dev/null +++ b/tests/test_occurences.py @@ -0,0 +1,129 @@ +import json +import tempfile +from pathlib import Path + +import pandas as pd +import pytest +import yirgacheffe as yg +from shapely.geometry import mapping, Polygon + +from aoh.validation.validate_occurences import process_species + +def test_empty_species_list() -> None: + df = pd.DataFrame([], columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']) + res = process_species(Path("/some/aohs"), df) + assert len(res) == 0 + +def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None: + + shapes = {'area': shape if shape is not None else Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])} + + features = [] + for name, geom in shapes.items(): + feature = { + "type": "Feature", + "properties": {}, + "geometry": mapping(geom) + } + features.append(feature) + + geojson = { + "type": "FeatureCollection", + "features": features + } + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + geojson_path = tmpdir_path / "tmp.geojson" + with open(geojson_path, 'w', encoding="UTF-8") as f: + json.dump(geojson, f, indent=2) + + with yg.read_shape(geojson_path, ("epsg:4326", (1.0, -1.0))) as shape: + shape.to_geotiff(filename) + +@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[ + (42, 5.0, 5.0, True), + (42, 12.0, 12.0, False), + (40, 5.0, 5.0, False), +]) +def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + for test_id in [41, 42, 43]: + aoh_path = tmpdir_path / f"{test_id}.tif" + generate_faux_aoh(aoh_path) + + df = pd.DataFrame( + [(taxon_id, latitude, longitude)], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'] + ) + + res = process_species(tmpdir_path, df) + + assert len(res) == len(df) + occurence = res.occurence[0] + assert occurence == 
expected + +def test_multiple_match() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + for test_id in [41, 42, 43]: + aoh_path = tmpdir_path / f"{test_id}.tif" + generate_faux_aoh(aoh_path) + + df = pd.DataFrame( + [ + (42, 5.0, 5.0, True), + (42, 12.0, 12.0, False), + ], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected'] + ) + + res = process_species(tmpdir_path, df) + + assert len(res) == len(df) + assert (res.occurence == res.expected).all() + +def test_too_many_ids() -> None: + df = pd.DataFrame( + [ + (42, 5.0, 5.0, True), + (42, 12.0, 12.0, False), + (40, 5.0, 5.0, False), + ], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected'] + ) + + with pytest.raises(ValueError): + _ = process_species(Path("/some/aohs"), df) + +@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[ + (42, 5.0, 5.0, True), + (42, -5.0, -5.0, True), + (42, 5.0, -5.0, False), + (42, -5.0, 5.0, False), + (40, 5.0, 5.0, False), +]) +def test_find_seasonal(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + for season, shape in [ + ('breeding', Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])), + ('nonbreeding', Polygon([(0, 0), (0, -10), (-10, -10), (-10, 0)])), + ]: + aoh_path = tmpdir_path / f"42_{season}.tif" + generate_faux_aoh(aoh_path, shape) + + df = pd.DataFrame( + [(taxon_id, latitude, longitude)], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'] + ) + + res = process_species(tmpdir_path, df) + + assert len(res) == len(df) + occurence = res.occurence[0] + assert occurence == expected From 84ece4639249e52186eaef4f8b60a3e9ee24f493 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Wed, 1 Oct 2025 10:43:19 +0100 Subject: [PATCH 7/8] linting, typing, and metadata updates --- .github/workflows/pull-request.yml | 1 + aoh/validation/collate_data.py | 2 +- 
pyproject.toml | 6 ++++-- tests/test_occurences.py | 4 ++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 0cca3b5..8f4c4c9 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -57,6 +57,7 @@ jobs: aoh-endemism --help aoh-collate-data --help aoh-validate-prevalence --help + aoh-validate-occurences --help aoh-fetch-gbif-data --help - name: Test package imports diff --git a/aoh/validation/collate_data.py b/aoh/validation/collate_data.py index 952c09a..fcce4e4 100644 --- a/aoh/validation/collate_data.py +++ b/aoh/validation/collate_data.py @@ -30,7 +30,7 @@ def collate_data( output_path: Path, ) -> None: manifests = aoh_results.glob("**/*.json") - if len(list(manifests)): + if len(list(manifests)) == 0: sys.exit(f"Found no manifests in {aoh_results}") os.makedirs(output_path.parent, exist_ok=True) diff --git a/pyproject.toml b/pyproject.toml index db96cf7..31bf0ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "aoh" -version = "1.0.1" +version = "1.1.0" description = "A library for calculating Area of Habitat for species distribution mapping" authors = [ {name = "Michael Dales", email = "mwd24@cam.ac.uk"} @@ -29,7 +29,7 @@ dependencies = [ "psutil", "pyproj>=3.4,<4.0", "scikit-image>=0.20,<1.0", - "yirgacheffe>=1.7.8,<2.0", + "yirgacheffe>=1.9.1,<2.0", "zenodo_search", "pandas>=2.0,<3.0", "gdal[numpy]>=3.8,<3.12", @@ -47,6 +47,7 @@ dev = [ "pytest", "types-psutil", "types-requests", + "types-shapely", "pandas-stubs", "geojson", "pytest-cov", @@ -66,6 +67,7 @@ aoh-species-richness = "aoh.summaries.species_richness:main" aoh-endemism = "aoh.summaries.endemism:main" aoh-collate-data = "aoh.validation.collate_data:main" aoh-validate-prevalence = "aoh.validation.validate_map_prevalence:main" +aoh-validate-occurences = "aoh.validation.validate_occurences:main" 
aoh-fetch-gbif-data = "aoh.validation.fetch_gbif_data:main" [tool.setuptools] diff --git a/tests/test_occurences.py b/tests/test_occurences.py index c6a2415..786cd18 100644 --- a/tests/test_occurences.py +++ b/tests/test_occurences.py @@ -38,8 +38,8 @@ def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None: with open(geojson_path, 'w', encoding="UTF-8") as f: json.dump(geojson, f, indent=2) - with yg.read_shape(geojson_path, ("epsg:4326", (1.0, -1.0))) as shape: - shape.to_geotiff(filename) + with yg.read_shape(geojson_path, ("epsg:4326", (1.0, -1.0))) as shape_layer: + shape_layer.to_geotiff(filename) @pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[ (42, 5.0, 5.0, True), From 7ce3ef91e169592dac0fc159f5632d24cd630a29 Mon Sep 17 00:00:00 2001 From: Michael Dales Date: Wed, 1 Oct 2025 10:53:20 +0100 Subject: [PATCH 8/8] Fix pylint on test --- aoh/validation/README.md | 1 + tests/test_occurences.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/aoh/validation/README.md index 9922092..ef43fcf 100644 --- a/aoh/validation/README.md +++ b/aoh/validation/README.md @@ -7,3 +7,4 @@ This directory contains the following scripts: * `collate_data.py` - Then you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run. * `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper. * `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurence data on GBIF that can be used for point validation as per the Dahal et al paper. +* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AOHs. 
diff --git a/tests/test_occurences.py b/tests/test_occurences.py index 786cd18..6cd6ea3 100644 --- a/tests/test_occurences.py +++ b/tests/test_occurences.py @@ -16,10 +16,12 @@ def test_empty_species_list() -> None: def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None: - shapes = {'area': shape if shape is not None else Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])} + shapes = [ + shape if shape is not None else Polygon([(0, 0), (0, 10), (10, 10), (10, 0)]) + ] features = [] - for name, geom in shapes.items(): + for geom in shapes: feature = { "type": "Feature", "properties": {},