diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 0cca3b5..8f4c4c9 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -57,6 +57,7 @@ jobs: aoh-endemism --help aoh-collate-data --help aoh-validate-prevalence --help + aoh-validate-occurences --help aoh-fetch-gbif-data --help - name: Test package imports diff --git a/aoh/validation/README.md b/aoh/validation/README.md index eb4860b..ef43fcf 100644 --- a/aoh/validation/README.md +++ b/aoh/validation/README.md @@ -1,3 +1,10 @@ # AoH Validation -This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). This Python implementation cribs heavily from an R implementation by [Franchesca Ridley](). \ No newline at end of file +This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). The model validation implementation cribs heavily from an R implementation by [Franchesca Ridley](). + +This directory contains the following scripts: + +* `collate_data.py` - When you generate a series of AOH GeoTIFFs, beside each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run. +* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper. +* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper. +* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AOHs. 
diff --git a/aoh/validation/collate_data.py b/aoh/validation/collate_data.py index e77a2b0..fcce4e4 100644 --- a/aoh/validation/collate_data.py +++ b/aoh/validation/collate_data.py @@ -2,7 +2,7 @@ import json import os import sys -from glob import glob +from pathlib import Path import pandas as pd @@ -26,16 +26,14 @@ ] def collate_data( - aoh_results: str, - output_path: str, + aoh_results: Path, + output_path: Path, ) -> None: - manifests = [os.path.join(aoh_results, fn) for fn in glob("**/*.json", root_dir=aoh_results, recursive=True)] - if not manifests: - print(f"Found no manifests in {aoh_results}", file=sys.stderr) - sys.exit(-1) + manifests = aoh_results.glob("**/*.json") + if len(list(manifests)) == 0: + sys.exit(f"Found no manifests in {aoh_results}") - output_dir, _ = os.path.split(output_path) - os.makedirs(output_dir, exist_ok=True) + os.makedirs(output_path.parent, exist_ok=True) res = [] all_keys = set() @@ -61,14 +59,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Collate metadata from AoH build.") parser.add_argument( '--aoh_results', - type=str, + type=Path, help="Path of all the AoH outputs.", required=True, dest="aohs_path" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_path", help="Destination for collated CSV." diff --git a/aoh/validation/validate_map_prevalence.py b/aoh/validation/validate_map_prevalence.py index 8f799f5..8e761a1 100644 --- a/aoh/validation/validate_map_prevalence.py +++ b/aoh/validation/validate_map_prevalence.py @@ -2,6 +2,7 @@ # Based on R code authored by Franchesca Ridley. 
import argparse +from pathlib import Path import pandas as pd @@ -64,8 +65,8 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame: return pd.concat(per_class_df) # type: ignore[no-any-return] def validate_map_prevalence( - collated_data_path: str, - output_path: str, + collated_data_path: Path, + output_path: Path, ) -> None: aoh_df = pd.read_csv(collated_data_path) outliers = model_validation(aoh_df) @@ -75,14 +76,14 @@ def main() -> None: parser = argparse.ArgumentParser(description="Validate map prevalence.") parser.add_argument( '--collated_aoh_data', - type=str, + type=Path, help="CSV containing collated AoH data", required=True, dest="collated_data_path" ) parser.add_argument( "--output", - type=str, + type=Path, required=True, dest="output_path", help="CSV of outliers." diff --git a/aoh/validation/validate_occurences.py b/aoh/validation/validate_occurences.py new file mode 100644 index 0000000..3b3c7fe --- /dev/null +++ b/aoh/validation/validate_occurences.py @@ -0,0 +1,105 @@ +import argparse +import os +from contextlib import ExitStack +from functools import partial +from multiprocessing import cpu_count, Pool +from pathlib import Path + +import pandas as pd +import yirgacheffe as yg + +def process_species( + aohs_path: Path, + species_occurences: pd.DataFrame, +) -> pd.DataFrame: + + if len(species_occurences) == 0: + species_occurences['occurence'] = None + return species_occurences + + taxon_ids = species_occurences.iucn_taxon_id.unique() + if len(taxon_ids) > 1: + raise ValueError("Too many taxon IDs") + taxon_id = taxon_ids[0] + + aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif")) + if len(aoh_files) == 0: + species_occurences['occurence'] = False + return species_occurences + + with ExitStack() as stack: + rasters = [stack.enter_context(yg.read_raster(x)) for x in aoh_files] + aoh = rasters[0] + for raster in rasters [1:]: + aoh += raster + + results = [] + for _, row in species_occurences.iterrows(): + pixel_x, pixel_y = 
aoh.pixel_for_latlng(row.decimalLatitude, row.decimalLongitude) + value = aoh.read_array(pixel_x, pixel_y, 1, 1) + results.append(value > 0.0) + + species_occurences['occurence'] = results + return species_occurences + +def validate_occurences( + gbif_data_path: Path, + aohs_path: Path, + output_path: Path, + process_count: int, +) -> None: + os.makedirs(output_path.parent, exist_ok=True) + + # The input is from the points.csv generated by fetch_gbif_data.py, which has the columns: + # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year + occurences = pd.read_csv(gbif_data_path) + occurences.drop(columns=['gbif_data', 'year'], inplace=True) + occurences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True) + occurences_per_species = [group for _, group in occurences.groupby('iucn_taxon_id')] + with Pool(processes=process_count) as pool: + results_per_species = pool.map(partial(process_species, aohs_path), occurences_per_species) + results = pd.concat(results_per_species) + results.to_csv(output_path) + +def main() -> None: + parser = argparse.ArgumentParser(description="Validate GBIF occurrences against AoHs.") + parser.add_argument( + '--gbif_data_path', + type=Path, + help="Data containing downloaded GBIF data.", + required=True, + dest="gbif_data_path" + ) + parser.add_argument( + '--aoh_results', + type=Path, + help="Path of all the AoH outputs.", + required=True, + dest="aohs_path" + ) + parser.add_argument( + "--output", + type=Path, + required=True, + dest="output_path", + help="CSV of outliers." + ) + parser.add_argument( + "-j", + type=int, + required=False, + default=round(cpu_count() / 2), + dest="process_count", + help="Optional number of concurrent processes to use." 
+ ) + args = parser.parse_args() + + validate_occurences( + args.gbif_data_path, + args.aohs_path, + args.output_path, + args.process_count, + ) + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index db96cf7..31bf0ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "aoh" -version = "1.0.1" +version = "1.1.0" description = "A library for calculating Area of Habitat for species distribution mapping" authors = [ {name = "Michael Dales", email = "mwd24@cam.ac.uk"} @@ -29,7 +29,7 @@ dependencies = [ "psutil", "pyproj>=3.4,<4.0", "scikit-image>=0.20,<1.0", - "yirgacheffe>=1.7.8,<2.0", + "yirgacheffe>=1.9.1,<2.0", "zenodo_search", "pandas>=2.0,<3.0", "gdal[numpy]>=3.8,<3.12", @@ -47,6 +47,7 @@ dev = [ "pytest", "types-psutil", "types-requests", + "types-shapely", "pandas-stubs", "geojson", "pytest-cov", @@ -66,6 +67,7 @@ aoh-species-richness = "aoh.summaries.species_richness:main" aoh-endemism = "aoh.summaries.endemism:main" aoh-collate-data = "aoh.validation.collate_data:main" aoh-validate-prevalence = "aoh.validation.validate_map_prevalence:main" +aoh-validate-occurences = "aoh.validation.validate_occurences:main" aoh-fetch-gbif-data = "aoh.validation.fetch_gbif_data:main" [tool.setuptools] diff --git a/tests/test_aohcalc.py b/tests/test_aohcalc.py index 6b387ca..cbf2d22 100644 --- a/tests/test_aohcalc.py +++ b/tests/test_aohcalc.py @@ -103,11 +103,11 @@ def generate_species_info( "full_habitat_code": "|".join(sorted(list(habitat_codes))), } coordinates = [[ - [-90, -54], - [90, -54], - [90, 54], - [-90, 54], - [-90, -54], + [-90, -45], + [90, -45], + [90, 45], + [-90, 45], + [-90, -45], ]] polygon = geojson.Polygon(coordinates) feature= geojson.Feature(geometry=polygon, properties=properties) diff --git a/tests/test_occurences.py b/tests/test_occurences.py new file mode 100644 index 0000000..6cd6ea3 --- /dev/null +++ b/tests/test_occurences.py @@ -0,0 +1,131 @@ 
+import json +import tempfile +from pathlib import Path + +import pandas as pd +import pytest +import yirgacheffe as yg +from shapely.geometry import mapping, Polygon + +from aoh.validation.validate_occurences import process_species + +def test_empty_species_list() -> None: + df = pd.DataFrame([], columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']) + res = process_species(Path("/some/aohs"), df) + assert len(res) == 0 + +def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None: + + shapes = [ + shape if shape is not None else Polygon([(0, 0), (0, 10), (10, 10), (10, 0)]) + ] + + features = [] + for geom in shapes: + feature = { + "type": "Feature", + "properties": {}, + "geometry": mapping(geom) + } + features.append(feature) + + geojson = { + "type": "FeatureCollection", + "features": features + } + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + geojson_path = tmpdir_path / "tmp.geojson" + with open(geojson_path, 'w', encoding="UTF-8") as f: + json.dump(geojson, f, indent=2) + + with yg.read_shape(geojson_path, ("epsg:4326", (1.0, -1.0))) as shape_layer: + shape_layer.to_geotiff(filename) + +@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[ + (42, 5.0, 5.0, True), + (42, 12.0, 12.0, False), + (40, 5.0, 5.0, False), +]) +def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + for test_id in [41, 42, 43]: + aoh_path = tmpdir_path / f"{test_id}.tif" + generate_faux_aoh(aoh_path) + + df = pd.DataFrame( + [(taxon_id, latitude, longitude)], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'] + ) + + res = process_species(tmpdir_path, df) + + assert len(res) == len(df) + occurence = res.occurence[0] + assert occurence == expected + +def test_multiple_match() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + for 
test_id in [41, 42, 43]: + aoh_path = tmpdir_path / f"{test_id}.tif" + generate_faux_aoh(aoh_path) + + df = pd.DataFrame( + [ + (42, 5.0, 5.0, True), + (42, 12.0, 12.0, False), + ], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected'] + ) + + res = process_species(tmpdir_path, df) + + assert len(res) == len(df) + assert (res.occurence == res.expected).all() + +def test_too_many_ids() -> None: + df = pd.DataFrame( + [ + (42, 5.0, 5.0, True), + (42, 12.0, 12.0, False), + (40, 5.0, 5.0, False), + ], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected'] + ) + + with pytest.raises(ValueError): + _ = process_species(Path("/some/aohs"), df) + +@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[ + (42, 5.0, 5.0, True), + (42, -5.0, -5.0, True), + (42, 5.0, -5.0, False), + (42, -5.0, 5.0, False), + (40, 5.0, 5.0, False), +]) +def test_find_seasonal(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + + for season, shape in [ + ('breeding', Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])), + ('nonbreeding', Polygon([(0, 0), (0, -10), (-10, -10), (-10, 0)])), + ]: + aoh_path = tmpdir_path / f"42_{season}.tif" + generate_faux_aoh(aoh_path, shape) + + df = pd.DataFrame( + [(taxon_id, latitude, longitude)], + columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'] + ) + + res = process_species(tmpdir_path, df) + + assert len(res) == len(df) + occurence = res.occurence[0] + assert occurence == expected