Skip to content

Commit 6dd0c73

Browse files
authored
Merge pull request #27 from quantifyearth/mwd-point-validation
Add point validation using occurrence data
2 parents 4dcfbff + 7ce3ef9 commit 6dd0c73

File tree

8 files changed

+268
-23
lines changed

8 files changed

+268
-23
lines changed

.github/workflows/pull-request.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ jobs:
5757
aoh-endemism --help
5858
aoh-collate-data --help
5959
aoh-validate-prevalence --help
60+
aoh-validate-occurences --help
6061
aoh-fetch-gbif-data --help
6162
6263
- name: Test package imports

aoh/validation/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
11
# AoH Validation
22

3-
This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). This Python implementation cribs heavily from an R implementation by [Franchesca Ridley]().
3+
This directory contains code to implement the model base validation proposed by [Dahal et al](https://gmd.copernicus.org/articles/15/5093/2022/). The model validation implementation cribs heavily from an R implementation by [Franchesca Ridley]().
4+
5+
This directory contains the following scripts:
6+
7+
* `collate_data.py` - When you generate a series of AoH GeoTIFFs, beside each one is a JSON file that contains information required for validation. This script takes a folder containing the AoH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
8+
* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper.
9+
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper.
10+
* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AoHs.

aoh/validation/collate_data.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
import sys
5-
from glob import glob
5+
from pathlib import Path
66

77
import pandas as pd
88

@@ -26,16 +26,14 @@
2626
]
2727

2828
def collate_data(
29-
aoh_results: str,
30-
output_path: str,
29+
aoh_results: Path,
30+
output_path: Path,
3131
) -> None:
32-
manifests = [os.path.join(aoh_results, fn) for fn in glob("**/*.json", root_dir=aoh_results, recursive=True)]
33-
if not manifests:
34-
print(f"Found no manifests in {aoh_results}", file=sys.stderr)
35-
sys.exit(-1)
32+
manifests = aoh_results.glob("**/*.json")
33+
if len(list(manifests)) == 0:
34+
sys.exit(f"Found no manifests in {aoh_results}")
3635

37-
output_dir, _ = os.path.split(output_path)
38-
os.makedirs(output_dir, exist_ok=True)
36+
os.makedirs(output_path.parent, exist_ok=True)
3937

4038
res = []
4139
all_keys = set()
@@ -61,14 +59,14 @@ def main() -> None:
6159
parser = argparse.ArgumentParser(description="Collate metadata from AoH build.")
6260
parser.add_argument(
6361
'--aoh_results',
64-
type=str,
62+
type=Path,
6563
help="Path of all the AoH outputs.",
6664
required=True,
6765
dest="aohs_path"
6866
)
6967
parser.add_argument(
7068
"--output",
71-
type=str,
69+
type=Path,
7270
required=True,
7371
dest="output_path",
7472
help="Destination for collated CSV."

aoh/validation/validate_map_prevalence.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Based on R code authored by Franchesca Ridley.
33

44
import argparse
5+
from pathlib import Path
56

67
import pandas as pd
78

@@ -64,8 +65,8 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
6465
return pd.concat(per_class_df) # type: ignore[no-any-return]
6566

6667
def validate_map_prevalence(
67-
collated_data_path: str,
68-
output_path: str,
68+
collated_data_path: Path,
69+
output_path: Path,
6970
) -> None:
7071
aoh_df = pd.read_csv(collated_data_path)
7172
outliers = model_validation(aoh_df)
@@ -75,14 +76,14 @@ def main() -> None:
7576
parser = argparse.ArgumentParser(description="Validate map prevalence.")
7677
parser.add_argument(
7778
'--collated_aoh_data',
78-
type=str,
79+
type=Path,
7980
help="CSV containing collated AoH data",
8081
required=True,
8182
dest="collated_data_path"
8283
)
8384
parser.add_argument(
8485
"--output",
85-
type=str,
86+
type=Path,
8687
required=True,
8788
dest="output_path",
8889
help="CSV of outliers."
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import argparse
2+
import os
3+
from contextlib import ExitStack
4+
from functools import partial
5+
from multiprocessing import cpu_count, Pool
6+
from pathlib import Path
7+
8+
import pandas as pd
9+
import yirgacheffe as yg
10+
11+
def process_species(
    aohs_path: Path,
    species_occurences: pd.DataFrame,
) -> pd.DataFrame:
    """Check one species' occurrence records against its AoH rasters.

    Adds an `occurence` column to the frame: truthy where the summed AoH
    value at the record's lat/lng is positive, False for every row when no
    AoH raster exists for the taxon. All rows must share a single taxon ID.
    """
    # No records: still add the column so the schema is uniform downstream.
    if species_occurences.empty:
        species_occurences['occurence'] = None
        return species_occurences

    taxon_ids = species_occurences.iucn_taxon_id.unique()
    if len(taxon_ids) > 1:
        raise ValueError("Too many taxon IDs")
    taxon_id = taxon_ids[0]

    # An AoH run may emit several rasters per taxon (e.g. seasonal maps),
    # all named with the taxon ID prefix.
    aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif"))
    if not aoh_files:
        species_occurences['occurence'] = False
        return species_occurences

    presence = []
    with ExitStack() as stack:
        layers = [stack.enter_context(yg.read_raster(path)) for path in aoh_files]
        # Sum the layers so a point counts as present if any raster covers it.
        combined = layers[0]
        for layer in layers[1:]:
            combined += layer

        for _, record in species_occurences.iterrows():
            pixel_x, pixel_y = combined.pixel_for_latlng(record.decimalLatitude, record.decimalLongitude)
            sample = combined.read_array(pixel_x, pixel_y, 1, 1)
            presence.append(sample > 0.0)

    species_occurences['occurence'] = presence
    return species_occurences
44+
45+
def validate_occurences(
    gbif_data_path: Path,
    aohs_path: Path,
    output_path: Path,
    process_count: int,
) -> None:
    """Score every downloaded GBIF occurrence point against the AoH corpus.

    Reads the CSV produced by fetch_gbif_data.py, fans the per-species
    groups out over a process pool, and writes the combined frame (with
    the added `occurence` column) to `output_path`.
    """
    os.makedirs(output_path.parent, exist_ok=True)

    # Input is the points.csv generated by fetch_gbif_data.py, with columns:
    # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year
    # NOTE(review): the dropped names ('gbif_data', 'year') do not match the
    # columns listed above ('gbif_id', 'assessment year') — confirm against
    # the actual output of fetch_gbif_data.py.
    points = pd.read_csv(gbif_data_path)
    points.drop(columns=['gbif_data', 'year'], inplace=True)
    # Group rows by taxon so each worker opens a taxon's rasters only once.
    points.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
    per_species = [group for _, group in points.groupby('iucn_taxon_id')]

    with Pool(processes=process_count) as pool:
        checked_per_species = pool.map(partial(process_species, aohs_path), per_species)
    pd.concat(checked_per_species).to_csv(output_path)
63+
64+
def main() -> None:
    """Command-line entry point: parse arguments and run occurrence validation.

    Bug fix: the "-j" option used dest="processes_count" while the call below
    read args.process_count, which raised AttributeError whenever the script
    ran; the dest now matches. The parser description and help strings were
    also copy-pasted from validate_map_prevalence.py and have been corrected
    to describe this script.
    """
    parser = argparse.ArgumentParser(description="Validate GBIF occurrence points against AoH maps.")
    parser.add_argument(
        '--gbif_data_path',
        type=Path,
        help="Data containing downloaded GBIF data.",
        required=True,
        dest="gbif_data_path"
    )
    parser.add_argument(
        '--aoh_results',
        type=Path,
        help="Path of all the AoH outputs.",
        required=True,
        dest="aohs_path"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        dest="output_path",
        help="Destination CSV for occurrence validation results."
    )
    parser.add_argument(
        "-j",
        type=int,
        required=False,
        # Default to half the cores, as the work is I/O heavy.
        default=round(cpu_count() / 2),
        dest="process_count",
        help="Optional number of concurrent processes to use."
    )
    args = parser.parse_args()

    validate_occurences(
        args.gbif_data_path,
        args.aohs_path,
        args.output_path,
        args.process_count,
    )
103+
104+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "aoh"
7-
version = "1.0.1"
7+
version = "1.1.0"
88
description = "A library for calculating Area of Habitat for species distribution mapping"
99
authors = [
1010
{name = "Michael Dales", email = "mwd24@cam.ac.uk"}
@@ -29,7 +29,7 @@ dependencies = [
2929
"psutil",
3030
"pyproj>=3.4,<4.0",
3131
"scikit-image>=0.20,<1.0",
32-
"yirgacheffe>=1.7.8,<2.0",
32+
"yirgacheffe>=1.9.1,<2.0",
3333
"zenodo_search",
3434
"pandas>=2.0,<3.0",
3535
"gdal[numpy]>=3.8,<3.12",
@@ -47,6 +47,7 @@ dev = [
4747
"pytest",
4848
"types-psutil",
4949
"types-requests",
50+
"types-shapely",
5051
"pandas-stubs",
5152
"geojson",
5253
"pytest-cov",
@@ -66,6 +67,7 @@ aoh-species-richness = "aoh.summaries.species_richness:main"
6667
aoh-endemism = "aoh.summaries.endemism:main"
6768
aoh-collate-data = "aoh.validation.collate_data:main"
6869
aoh-validate-prevalence = "aoh.validation.validate_map_prevalence:main"
70+
aoh-validate-occurences = "aoh.validation.validate_occurences:main"
6971
aoh-fetch-gbif-data = "aoh.validation.fetch_gbif_data:main"
7072

7173
[tool.setuptools]

tests/test_aohcalc.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,11 @@ def generate_species_info(
103103
"full_habitat_code": "|".join(sorted(list(habitat_codes))),
104104
}
105105
coordinates = [[
106-
[-90, -54],
107-
[90, -54],
108-
[90, 54],
109-
[-90, 54],
110-
[-90, -54],
106+
[-90, -45],
107+
[90, -45],
108+
[90, 45],
109+
[-90, 45],
110+
[-90, -45],
111111
]]
112112
polygon = geojson.Polygon(coordinates)
113113
feature= geojson.Feature(geometry=polygon, properties=properties)

tests/test_occurences.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import json
2+
import tempfile
3+
from pathlib import Path
4+
5+
import pandas as pd
6+
import pytest
7+
import yirgacheffe as yg
8+
from shapely.geometry import mapping, Polygon
9+
10+
from aoh.validation.validate_occurences import process_species
11+
12+
def test_empty_species_list() -> None:
    """An empty occurrence frame comes straight back with zero rows."""
    empty = pd.DataFrame([], columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude'])
    result = process_species(Path("/some/aohs"), empty)
    assert result.empty
16+
17+
def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None:
    """Rasterise a polygon (default: a 10x10 square at the origin) to a GeoTIFF.

    Writes the polygon via a temporary GeoJSON file so yirgacheffe can burn
    it into a raster at `filename`.
    """
    geometry = shape if shape is not None else Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])
    collection = {
        "type": "FeatureCollection",
        "features": [{
            "type": "Feature",
            "properties": {},
            "geometry": mapping(geometry),
        }],
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        vector_path = Path(tmpdir) / "tmp.geojson"
        with open(vector_path, 'w', encoding="UTF-8") as handle:
            json.dump(collection, handle, indent=2)

        # Rasterise at 1 degree per pixel in WGS84.
        with yg.read_shape(vector_path, ("epsg:4326", (1.0, -1.0))) as layer:
            layer.to_geotiff(filename)
45+
46+
@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
    (42, 5.0, 5.0, True),
    (42, 12.0, 12.0, False),
    (40, 5.0, 5.0, False),
])
def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
    """A point matches only when inside the correct taxon's AoH raster."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        # Rasters for several taxa, so the lookup must pick the right file.
        for neighbour_id in [41, 42, 43]:
            generate_faux_aoh(aoh_dir / f"{neighbour_id}.tif")

        occurrences = pd.DataFrame(
            [(taxon_id, latitude, longitude)],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
        )

        checked = process_species(aoh_dir, occurrences)

        assert len(checked) == len(occurrences)
        assert checked.occurence[0] == expected
69+
70+
def test_multiple_match() -> None:
    """Per-row results line up with per-row expectations for one species."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        for neighbour_id in [41, 42, 43]:
            generate_faux_aoh(aoh_dir / f"{neighbour_id}.tif")

        # One point inside the default 10x10 square, one outside.
        occurrences = pd.DataFrame(
            [
                (42, 5.0, 5.0, True),
                (42, 12.0, 12.0, False),
            ],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected']
        )

        checked = process_species(aoh_dir, occurrences)

        assert len(checked) == len(occurrences)
        assert (checked.occurence == checked.expected).all()
90+
91+
def test_too_many_ids() -> None:
    """Mixing taxon IDs in one group is a programming error and raises."""
    occurrences = pd.DataFrame(
        [
            (42, 5.0, 5.0, True),
            (42, 12.0, 12.0, False),
            (40, 5.0, 5.0, False),
        ],
        columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude', 'expected']
    )
    with pytest.raises(ValueError):
        _ = process_species(Path("/some/aohs"), occurrences)
103+
104+
@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
    (42, 5.0, 5.0, True),
    (42, -5.0, -5.0, True),
    (42, 5.0, -5.0, False),
    (42, -5.0, 5.0, False),
    (40, 5.0, 5.0, False),
])
def test_find_seasonal(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
    """A point is matched if it falls in either of a taxon's seasonal rasters."""
    with tempfile.TemporaryDirectory() as tmpdir:
        aoh_dir = Path(tmpdir)

        # Two disjoint seasonal AoHs sharing the same taxon ID prefix.
        seasonal_footprints = [
            ('breeding', Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])),
            ('nonbreeding', Polygon([(0, 0), (0, -10), (-10, -10), (-10, 0)])),
        ]
        for season, footprint in seasonal_footprints:
            generate_faux_aoh(aoh_dir / f"42_{season}.tif", footprint)

        occurrences = pd.DataFrame(
            [(taxon_id, latitude, longitude)],
            columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
        )

        checked = process_species(aoh_dir, occurrences)

        assert len(checked) == len(occurrences)
        assert checked.occurence[0] == expected

0 commit comments

Comments
 (0)