Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
aoh-endemism --help
aoh-collate-data --help
aoh-validate-prevalence --help
aoh-validate-occurences --help
aoh-fetch-gbif-data --help

- name: Test package imports
Expand Down
9 changes: 8 additions & 1 deletion aoh/validation/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# AoH Validation

This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). This Python implementation cribs heavily from an R implementation by [Franchesca Ridley]().
This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). The model validation implementation cribs heavily from an R implementation by [Franchesca Ridley]().

This directory contains the following scripts:

* `collate_data.py` - When you generate a series of AOH GeoTIFFs, beside each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper.
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper.
* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AOHs.
20 changes: 9 additions & 11 deletions aoh/validation/collate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import json
import os
import sys
from glob import glob
from pathlib import Path

import pandas as pd

Expand All @@ -26,16 +26,14 @@
]

def collate_data(
aoh_results: str,
output_path: str,
aoh_results: Path,
output_path: Path,
) -> None:
manifests = [os.path.join(aoh_results, fn) for fn in glob("**/*.json", root_dir=aoh_results, recursive=True)]
if not manifests:
print(f"Found no manifests in {aoh_results}", file=sys.stderr)
sys.exit(-1)
manifests = aoh_results.glob("**/*.json")
if len(list(manifests)) == 0:
sys.exit(f"Found no manifests in {aoh_results}")

output_dir, _ = os.path.split(output_path)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_path.parent, exist_ok=True)

res = []
all_keys = set()
Expand All @@ -61,14 +59,14 @@ def main() -> None:
parser = argparse.ArgumentParser(description="Collate metadata from AoH build.")
parser.add_argument(
'--aoh_results',
type=str,
type=Path,
help="Path of all the AoH outputs.",
required=True,
dest="aohs_path"
)
parser.add_argument(
"--output",
type=str,
type=Path,
required=True,
dest="output_path",
help="Destination for collated CSV."
Expand Down
9 changes: 5 additions & 4 deletions aoh/validation/validate_map_prevalence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Based on R code authored by Franchesca Ridley.

import argparse
from pathlib import Path

import pandas as pd

Expand Down Expand Up @@ -64,8 +65,8 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
return pd.concat(per_class_df) # type: ignore[no-any-return]

def validate_map_prevalence(
collated_data_path: str,
output_path: str,
collated_data_path: Path,
output_path: Path,
) -> None:
aoh_df = pd.read_csv(collated_data_path)
outliers = model_validation(aoh_df)
Expand All @@ -75,14 +76,14 @@ def main() -> None:
parser = argparse.ArgumentParser(description="Validate map prevalence.")
parser.add_argument(
'--collated_aoh_data',
type=str,
type=Path,
help="CSV containing collated AoH data",
required=True,
dest="collated_data_path"
)
parser.add_argument(
"--output",
type=str,
type=Path,
required=True,
dest="output_path",
help="CSV of outliers."
Expand Down
105 changes: 105 additions & 0 deletions aoh/validation/validate_occurences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import argparse
import os
from contextlib import ExitStack
from functools import partial
from multiprocessing import cpu_count, Pool
from pathlib import Path

import pandas as pd
import yirgacheffe as yg

def process_species(
    aohs_path: Path,
    species_occurences: pd.DataFrame,
) -> pd.DataFrame:
    """Flag each occurrence record for one species as inside or outside its AoH.

    Args:
        aohs_path: Directory searched recursively for AoH GeoTIFFs whose
            filenames start with the taxon ID (this also picks up seasonal
            rasters such as "42_breeding.tif" and "42_nonbreeding.tif").
        species_occurences: Occurrence rows for exactly one taxon; must carry
            iucn_taxon_id, decimalLatitude and decimalLongitude columns.

    Returns:
        The input frame with an added 'occurence' column: truthy where the
        point lands on a positive AoH pixel, False where the species has no
        AoH rasters, None for an empty input frame.

    Raises:
        ValueError: If the frame mixes rows from more than one taxon ID.

    NOTE(review): the 'occurence' column is added to the caller's frame
    in place; under multiprocessing each worker gets a copy, but direct
    callers will see their frame mutated.
    """

    # Empty group: nothing to look up, but keep the column schema consistent.
    if len(species_occurences) == 0:
        species_occurences['occurence'] = None
        return species_occurences

    taxon_ids = species_occurences.iucn_taxon_id.unique()
    if len(taxon_ids) > 1:
        raise ValueError("Too many taxon IDs")
    taxon_id = taxon_ids[0]

    aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif"))
    if len(aoh_files) == 0:
        # No AoH produced for this species: every point counts as a miss.
        species_occurences['occurence'] = False
        return species_occurences

    with ExitStack() as stack:
        # Open every seasonal/partial raster and sum them into one layer, so a
        # point inside any season's AoH counts as present.
        rasters = [stack.enter_context(yg.read_raster(x)) for x in aoh_files]
        aoh = rasters[0]
        for raster in rasters [1:]:
            aoh += raster

        results = []
        for _, row in species_occurences.iterrows():
            pixel_x, pixel_y = aoh.pixel_for_latlng(row.decimalLatitude, row.decimalLongitude)
            # NOTE(review): read_array presumably returns a 1x1 array, so each
            # appended result is array-like rather than a plain bool — confirm
            # against yirgacheffe's API if a scalar column is required.
            value = aoh.read_array(pixel_x, pixel_y, 1, 1)
            results.append(value > 0.0)

        species_occurences['occurence'] = results
        return species_occurences

def validate_occurences(
    gbif_data_path: Path,
    aohs_path: Path,
    output_path: Path,
    process_count: int,
) -> None:
    """Check GBIF occurrence points against the AoH rasters and write a result CSV.

    Args:
        gbif_data_path: CSV of occurrence points produced by fetch_gbif_data.py.
        aohs_path: Directory containing the AoH GeoTIFF outputs.
        output_path: Destination CSV; its parent directory is created if needed.
        process_count: Number of worker processes for the per-species checks.
    """
    os.makedirs(output_path.parent, exist_ok=True)

    # The input is from the points.csv generated by fetch_gbif_data.py, which has the columns:
    # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year
    # NOTE(review): the drop below removes 'gbif_data', but the comment above
    # lists the column as 'gbif_id' — confirm which name points.csv actually uses.
    occurences = pd.read_csv(gbif_data_path)
    occurences.drop(columns=['gbif_data', 'year'], inplace=True)
    occurences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
    # One group per taxon so each worker only opens that species' rasters.
    occurences_per_species = [group for _, group in occurences.groupby('iucn_taxon_id')]
    with Pool(processes=process_count) as pool:
        results_per_species = pool.map(partial(process_species, aohs_path), occurences_per_species)
    results = pd.concat(results_per_species)
    results.to_csv(output_path)

def main() -> None:
    """Command-line entry point: parse arguments and run the occurrence validation.

    Raises:
        SystemExit: On missing/invalid command-line arguments (via argparse).
    """
    # Fixed copy-pasted description from validate_map_prevalence.py.
    parser = argparse.ArgumentParser(description="Validate GBIF occurrence records against AoH rasters.")
    parser.add_argument(
        '--gbif_data_path',
        type=Path,
        help="Data containing downloaded GBIF data.",
        required=True,
        dest="gbif_data_path"
    )
    parser.add_argument(
        '--aoh_results',
        type=Path,
        help="Path of all the AoH outputs.",
        required=True,
        dest="aohs_path"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        dest="output_path",
        help="CSV of outliers."
    )
    parser.add_argument(
        "-j",
        type=int,
        required=False,
        # max(1, ...) guards against a zero worker count on single-core
        # machines (round(1 / 2) == 0, which Pool rejects).
        default=max(1, cpu_count() // 2),
        dest="processes_count",
        help="Optional number of concurrent processes to use."
    )
    args = parser.parse_args()

    validate_occurences(
        args.gbif_data_path,
        args.aohs_path,
        args.output_path,
        # Bug fix: argparse stores the value under dest="processes_count";
        # the original read args.process_count, which raised AttributeError.
        args.processes_count,
    )

# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "aoh"
version = "1.0.1"
version = "1.1.0"
description = "A library for calculating Area of Habitat for species distribution mapping"
authors = [
{name = "Michael Dales", email = "mwd24@cam.ac.uk"}
Expand All @@ -29,7 +29,7 @@ dependencies = [
"psutil",
"pyproj>=3.4,<4.0",
"scikit-image>=0.20,<1.0",
"yirgacheffe>=1.7.8,<2.0",
"yirgacheffe>=1.9.1,<2.0",
"zenodo_search",
"pandas>=2.0,<3.0",
"gdal[numpy]>=3.8,<3.12",
Expand All @@ -47,6 +47,7 @@ dev = [
"pytest",
"types-psutil",
"types-requests",
"types-shapely",
"pandas-stubs",
"geojson",
"pytest-cov",
Expand All @@ -66,6 +67,7 @@ aoh-species-richness = "aoh.summaries.species_richness:main"
aoh-endemism = "aoh.summaries.endemism:main"
aoh-collate-data = "aoh.validation.collate_data:main"
aoh-validate-prevalence = "aoh.validation.validate_map_prevalence:main"
aoh-validate-occurences = "aoh.validation.validate_occurences:main"
aoh-fetch-gbif-data = "aoh.validation.fetch_gbif_data:main"

[tool.setuptools]
Expand Down
10 changes: 5 additions & 5 deletions tests/test_aohcalc.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,11 @@ def generate_species_info(
"full_habitat_code": "|".join(sorted(list(habitat_codes))),
}
coordinates = [[
[-90, -54],
[90, -54],
[90, 54],
[-90, 54],
[-90, -54],
[-90, -45],
[90, -45],
[90, 45],
[-90, 45],
[-90, -45],
]]
polygon = geojson.Polygon(coordinates)
feature= geojson.Feature(geometry=polygon, properties=properties)
Expand Down
131 changes: 131 additions & 0 deletions tests/test_occurences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import json
import tempfile
from pathlib import Path

import pandas as pd
import pytest
import yirgacheffe as yg
from shapely.geometry import mapping, Polygon

from aoh.validation.validate_occurences import process_species

def test_empty_species_list() -> None:
    """An empty occurrence table comes back empty without touching any rasters."""
    empty = pd.DataFrame([], columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'])
    result = process_species(Path("/some/aohs"), empty)
    assert result.empty

def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None:
    """Rasterise a polygon (default: the 10x10 degree square at the origin) to a GeoTIFF."""
    if shape is None:
        shape = Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])

    collection = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "properties": {},
                "geometry": mapping(shape),
            }
        ],
    }

    with tempfile.TemporaryDirectory() as workdir:
        # Round-trip via GeoJSON so yirgacheffe can burn the shape to raster.
        vector_path = Path(workdir) / "tmp.geojson"
        with open(vector_path, 'w', encoding="UTF-8") as handle:
            json.dump(collection, handle, indent=2)

        with yg.read_shape(vector_path, ("epsg:4326", (1.0, -1.0))) as layer:
            layer.to_geotiff(filename)

@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
    (42, 5.0, 5.0, True),
    (42, 12.0, 12.0, False),
    (40, 5.0, 5.0, False),
])
def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
    """A single point matches only when it falls inside its own taxon's AoH."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        # Surround the target taxon with decoy rasters to exercise filename matching.
        for candidate_id in (41, 42, 43):
            generate_faux_aoh(aoh_dir / f"{candidate_id}.tif")

        occurrences = pd.DataFrame(
            [(taxon_id, latitude, longitude)],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
        )

        result = process_species(aoh_dir, occurrences)

        assert len(result) == len(occurrences)
        assert result.occurence[0] == expected

def test_multiple_match() -> None:
    """Several points for one taxon are each classified independently."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        for candidate_id in (41, 42, 43):
            generate_faux_aoh(aoh_dir / f"{candidate_id}.tif")

        # Carry the expected answer alongside each point for a vectorised check.
        occurrences = pd.DataFrame(
            [
                (42, 5.0, 5.0, True),
                (42, 12.0, 12.0, False),
            ],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected']
        )

        result = process_species(aoh_dir, occurrences)

        assert len(result) == len(occurrences)
        assert (result.occurence == result.expected).all()

def test_too_many_ids() -> None:
    """A frame mixing rows from more than one taxon must be rejected."""
    mixed = pd.DataFrame(
        [
            (42, 5.0, 5.0, True),
            (42, 12.0, 12.0, False),
            (40, 5.0, 5.0, False),
        ],
        columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected']
    )

    with pytest.raises(ValueError):
        process_species(Path("/some/aohs"), mixed)

@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
    (42, 5.0, 5.0, True),
    (42, -5.0, -5.0, True),
    (42, 5.0, -5.0, False),
    (42, -5.0, 5.0, False),
    (40, 5.0, 5.0, False),
])
def test_find_seasonal(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
    """A point inside ANY seasonal AoH raster for a taxon counts as present."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        # Two disjoint seasonal ranges: breeding in the NE quadrant,
        # nonbreeding in the SW quadrant.
        seasonal_shapes = {
            'breeding': Polygon([(0, 0), (0, 10), (10, 10), (10, 0)]),
            'nonbreeding': Polygon([(0, 0), (0, -10), (-10, -10), (-10, 0)]),
        }
        for season, seasonal_shape in seasonal_shapes.items():
            generate_faux_aoh(aoh_dir / f"42_{season}.tif", seasonal_shape)

        occurrences = pd.DataFrame(
            [(taxon_id, latitude, longitude)],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
        )

        result = process_species(aoh_dir, occurrences)

        assert len(result) == len(occurrences)
        assert result.occurence[0] == expected