4 changes: 3 additions & 1 deletion .gitignore
@@ -3,6 +3,8 @@ __pycache__/
*.py[cod]
*$py.class

.DS_Store

# C extensions
*.so

@@ -163,4 +165,4 @@ cython_debug/
.vscode/

# Claude Code
.claude/
Contributor Author

@shaneweisz had you removed this in your last PR? Not sure how this got added to mine - I wonder if it's from the merge conflict I had to resolve

Collaborator

I think I added it in my last PR. Perhaps it got removed accidentally during the merge conflict – I do think it makes sense to have it in the .gitignore like you've done 👍

.claude/
18 changes: 13 additions & 5 deletions aoh/validation/fetch_gbif_data.py
@@ -135,12 +135,20 @@ def build_point_validation_table(
gbif_data_path: Path,
map_df: pd.DataFrame,
output_csv_path: Path,
chunksize: int = 100_000,
) -> None:
gbif_data = pd.read_csv(gbif_data_path, sep='\t')
gbif_data.rename(columns={"speciesKey": "gbif_id"}, inplace=True)
updated_data = gbif_data.merge(map_df, on="gbif_id", how='inner')
necessary_columns = updated_data[["iucn_taxon_id", "gbif_id", "decimalLatitude", "decimalLongitude", "year"]]
necessary_columns.to_csv(output_csv_path, index=False)
first_chunk = True
for chunk in pd.read_csv(gbif_data_path, sep='\t', chunksize=chunksize, on_bad_lines='skip'):
chunk.rename(columns={"speciesKey": "gbif_id"}, inplace=True)
updated_data = chunk.merge(map_df, on="gbif_id", how='inner')
necessary_columns = updated_data[["iucn_taxon_id", "gbif_id", "decimalLatitude", "decimalLongitude", "year"]]
necessary_columns.to_csv(
output_csv_path,
mode='w' if first_chunk else 'a',
header=first_chunk,
index=False
)
first_chunk = False

def fetch_gbif_data(
collated_data_path: Path,
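A note on the chunked rewrite above: the key pattern is that to_csv is called once per chunk, opening in write mode with a header for the first chunk and append mode without a header thereafter, so the output file ends up identical to what a single whole-file write would have produced. A minimal, self-contained sketch of the pattern follows — the file names, the year column, and the filter step are hypothetical stand-ins, and on_bad_lines assumes pandas 1.3+:

from pathlib import Path

import pandas as pd

def filter_large_tsv(input_path: Path, output_path: Path, chunksize: int = 100_000) -> None:
    """Stream a large TSV through a filter without loading it all into memory."""
    first_chunk = True
    # chunksize makes read_csv yield an iterator of DataFrames, so only one
    # chunk is resident in memory at a time.
    for chunk in pd.read_csv(input_path, sep='\t', chunksize=chunksize, on_bad_lines='skip'):
        subset = chunk[chunk["year"] >= 2000]  # hypothetical stand-in for the real merge step
        subset.to_csv(
            output_path,
            mode='w' if first_chunk else 'a',  # truncate once, then append
            header=first_chunk,                # emit the header row only once
            index=False,
        )
        first_chunk = False

One subtlety worth noting: if the first chunk's filter produces zero rows, the header is still written, so downstream readers always see the expected columns.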
159 changes: 130 additions & 29 deletions aoh/validation/validate_occurences.py
@@ -1,65 +1,158 @@
import argparse
import json
import os
from contextlib import ExitStack
from functools import partial
from multiprocessing import cpu_count, Pool
from pathlib import Path
from typing import NamedTuple

import geopandas as gpd
import pandas as pd
import yirgacheffe as yg
from shapely.geometry import Point

class Record(NamedTuple):
iucn_taxon_id: int
total_records: int
clipped_points: int
unique_points: int
matches: int
point_prevalence: float | None
model_prevalence: float
is_valid: bool
is_outlier: bool | None

def process_species(
aohs_path: Path,
species_occurences: pd.DataFrame,
) -> pd.DataFrame:
species_data_path: Path,
species_occurrences: pd.DataFrame,
) -> Record:

if len(species_occurences) == 0:
species_occurences['occurence'] = None
return species_occurences
if len(species_occurrences) == 0:
raise ValueError("No occurrences")

taxon_ids = species_occurences.iucn_taxon_id.unique()
taxon_ids = species_occurrences.iucn_taxon_id.unique()
if len(taxon_ids) > 1:
raise ValueError("Too many taxon IDs")
taxon_id = taxon_ids[0]

aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif"))
# Here we abort on species with no data and on species
# with multiple seasons.
if len(aoh_files) == 0:
species_occurences['occurence'] = False
return species_occurences
raise FileNotFoundError("No AOHs found")
if len(aoh_files) > 1:
raise NotImplementedError("Multi-season AOHs not yet supported")

aoh_tiff_path = aoh_files[0]
aoh_data_path = aoh_tiff_path.with_suffix(".json")
with open(aoh_data_path, 'r', encoding='utf-8') as f:
aoh_data = json.load(f)

with ExitStack() as stack:
rasters = [stack.enter_context(yg.read_raster(x)) for x in aoh_files]
aoh = rasters[0]
for raster in rasters [1:]:
aoh += raster
species_data_files = list(species_data_path.glob(f"**/{taxon_id}*.geojson"))
if len(species_data_files) != 1:
raise RuntimeError(f"We expected one GeoJSON range file for the species, we found {len(species_data_files)}")
species_range = gpd.read_file(species_data_files[0])

# From Dahal et al: "This ensured that we only included points which fell inside
# the boundaries of the selected range maps."
points_gdf = gpd.GeoDataFrame(
species_occurrences,
geometry=[
Point(lon, lat)
for lon, lat in
zip(species_occurrences['decimalLongitude'], species_occurrences['decimalLatitude'])
],
crs='EPSG:4326',
)
clipped_points = gpd.sjoin(points_gdf, species_range, predicate='within', how='inner')

pixel_set = set()
with yg.read_raster(aoh_files[0]) as aoh:
results = []
for _, row in species_occurences.iterrows():
for _, row in clipped_points.iterrows():
pixel_x, pixel_y = aoh.pixel_for_latlng(row.decimalLatitude, row.decimalLongitude)

# Dahal et al: "We also made sure that only one point locality was allowed per pixel of
# the AOH map to avoid clustering of points."
if (pixel_x, pixel_y) in pixel_set:
continue
pixel_set.add((pixel_x, pixel_y))

value = aoh.read_array(pixel_x, pixel_y, 1, 1)
results.append(value > 0.0)
results.append(value[0][0] > 0.0)

# From Dahal et al: "Finally, we excluded species which had fewer than 10 point localities after
# all the filters were applied."
is_valid = len(results) >= 10

model_prevalence = aoh_data['prevalence']
matches = len([x for x in results if x])
if is_valid:
point_prevalence = matches / len(results)

# From Dahal et al: "If the point prevalence exceeded model prevalence at
# species level, the AOH maps performed better than random,
# otherwise they were no better than random."
#
# However, note that this means if you have a point prevalence of 1.0 (all
# points match) and a model prevalence of 1.0 (range and AOH match, which
# under the IUCN method is the preferred fallback if we have zero on either
# elevation filtering or habitat filtering), then that would still be marked
# as an outlier (as 1.0 does not exceed 1.0), which seems wrong, so I'm
# special-casing that.
is_outlier = (point_prevalence != 1.0) and (point_prevalence < model_prevalence)
else:
point_prevalence = None
is_outlier = None

species_occurences['occurence'] = results
return species_occurences

def validate_occurences(
return Record(
taxon_id, # Species Redlist ID
len(species_occurrences), # Raw number of occurrences from GBIF
len(clipped_points), # Number of occurrences clipped to species range
len(results), # Number of unique occurrences by pixel
matches, # Number of occurrences within AOH
point_prevalence, # Point prevalence as per Dahal et al
model_prevalence, # Model prevalence as per Dahal et al
is_valid, # Whether we consider the result valid
is_outlier, # Whether species is considered an outlier
)

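To make the prevalence logic above concrete, a toy worked example (all numbers invented, not from any real species):

# Toy numbers only, to illustrate the logic in process_species above.
results = [True] * 20 + [False] * 5        # 25 unique pixels, 20 of them inside the AOH
model_prevalence = 0.85                    # would come from the AOH's JSON sidecar

is_valid = len(results) >= 10              # 25 >= 10, so the species counts
matches = sum(results)                     # 20
point_prevalence = matches / len(results)  # 20 / 25 = 0.8
# 0.8 is below 0.85 and is not the special-cased 1.0, so this species
# would be flagged as an outlier under the Dahal et al test.
is_outlier = (point_prevalence != 1.0) and (point_prevalence < model_prevalence)
assert is_valid and is_outlier
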
def process_species_wrapper(
aohs_path: Path,
species_data_path: Path,
species_occurrences: pd.DataFrame,
) -> Record | None:
# This wrapper exists to make it easier to write unit tests for process_species by having it
# throw unique exceptions for each failure, while still letting us invoke it via pool.map,
# which won't tolerate those exceptions.
try:
return process_species(aohs_path, species_data_path, species_occurrences)
except (ValueError, FileNotFoundError, RuntimeError, NotImplementedError):
return None

def validate_occurrences(
gbif_data_path: Path,
aohs_path: Path,
species_data_path: Path,
output_path: Path,
process_count: int,
) -> None:
os.makedirs(output_path.parent, exist_ok=True)

# The input is from the points.csv generated by fetch_gbif_data.py, which has the columns:
# iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, year
occurences = pd.read_csv(gbif_data_path)
occurences.drop(columns=['gbif_data', 'year'], inplace=True)
occurences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
occurences_per_species = [group for _, group in occurences.groupby('iucn_taxon_id')]
occurrences = pd.read_csv(gbif_data_path)
occurrences.drop(columns=['gbif_id', 'year'], inplace=True)
occurrences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
occurrences_per_species = [group for _, group in occurrences.groupby('iucn_taxon_id')]
with Pool(processes=process_count) as pool:
results_per_species = pool.map(partial(process_species, aohs_path), occurences_per_species)
results = pd.concat(results_per_species)
results.to_csv(output_path)
results_per_species = pool.map(partial(process_species, aohs_path, species_data_path), occurrences_per_species)
cleaned_results = [x for x in results_per_species if x is not None]

summary = pd.DataFrame(cleaned_results)
summary.to_csv(output_path, index=False)

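Worth noting for the summary step above: pandas picks up a NamedTuple's field names as column names, so the CSV header comes out as iucn_taxon_id, total_records, and so on, with no explicit columns argument needed. A tiny sketch (the Row class and its values are hypothetical):

from typing import NamedTuple

import pandas as pd

class Row(NamedTuple):  # hypothetical stand-in for Record
    iucn_taxon_id: int
    matches: int

# pd.DataFrame on a list of NamedTuples uses the field names as columns.
summary = pd.DataFrame([Row(1234, 10), Row(5678, 3)])
assert list(summary.columns) == ["iucn_taxon_id", "matches"]
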
def main() -> None:
parser = argparse.ArgumentParser(description="Validate map prevalence.")
@@ -70,6 +163,13 @@ def main() -> None:
required=True,
dest="gbif_data_path"
)
parser.add_argument(
'--species_data',
type=Path,
help="Path of all the species range data.",
required=True,
dest="species_data_path",
)
parser.add_argument(
'--aoh_results',
type=Path,
@@ -88,17 +188,18 @@ def main() -> None:
"-j",
type=int,
required=False,
default=round(cpu_count() / 2),
default=cpu_count(),
dest="processes_count",
help="Optional number of concurrent threads to use."
)
args = parser.parse_args()

validate_occurences(
validate_occurrences(
args.gbif_data_path,
args.aohs_path,
args.species_data_path,
args.output_path,
args.process_count,
args.processes_count,
)

if __name__ == "__main__":
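For reviewers unfamiliar with the clipping step in process_species, a minimal sketch of point-in-polygon filtering with gpd.sjoin — the coordinates and the unit-square "range" are invented, and the predicate keyword assumes geopandas 0.10+:

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, box

# Three hypothetical occurrence points, two inside a unit-square "range".
occurrences = pd.DataFrame({
    "decimalLongitude": [0.5, 2.0, 0.9],
    "decimalLatitude": [0.5, 2.0, 0.1],
})
points_gdf = gpd.GeoDataFrame(
    occurrences,
    geometry=[Point(lon, lat) for lon, lat in
              zip(occurrences["decimalLongitude"], occurrences["decimalLatitude"])],
    crs="EPSG:4326",
)
species_range = gpd.GeoDataFrame(geometry=[box(0.0, 0.0, 1.0, 1.0)], crs="EPSG:4326")

# predicate='within' keeps only points that fall inside the polygon, mirroring
# the Dahal et al requirement that points lie inside the range map boundary.
clipped = gpd.sjoin(points_gdf, species_range, predicate="within", how="inner")
assert len(clipped) == 2  # the point at (2.0, 2.0) is dropped
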
1 change: 1 addition & 0 deletions pyproject.toml
@@ -52,6 +52,7 @@ dev = [
"pylint",
"mypy",
"pytest",
"types-geopandas",
"types-psutil",
"types-requests",
"types-shapely",