Skip to content

Commit d809377

Browse files
committed
WIP: make new occurrence path more testable
1 parent fc45277 commit d809377

File tree

3 files changed

+51
-25
lines changed

3 files changed

+51
-25
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ __pycache__/
33
*.py[cod]
44
*$py.class
55

6+
.DS_Store
7+
68
# C extensions
79
*.so
810

@@ -163,4 +165,4 @@ cython_debug/
163165
.vscode/
164166

165167
# Claude Code
166-
.claude/
168+
.claude/

aoh/validation/validate_occurences.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,24 @@
1313
def process_species(
1414
aohs_path: Path,
1515
species_data_path: Path,
16-
species_occurences: pd.DataFrame,
17-
) -> tuple[int, int, int, float, float, bool] | None:
18-
19-
os.environ["OGR_GEOJSON_MAX_OBJ_SIZE"] = "0"
16+
species_occurrences: pd.DataFrame,
17+
) -> tuple[int, int, int, float, float, bool]:
2018

21-
if len(species_occurences) == 0:
22-
return None
19+
if len(species_occurrences) == 0:
20+
raise ValueError("No occurrences")
2321

24-
taxon_ids = species_occurences.iucn_taxon_id.unique()
22+
taxon_ids = species_occurrences.iucn_taxon_id.unique()
2523
if len(taxon_ids) > 1:
2624
raise ValueError("Too many taxon IDs")
2725
taxon_id = taxon_ids[0]
2826

2927
aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif"))
3028
# We here are aborting on those species with no data or those
3129
# with multiple seasons
32-
if len(aoh_files) != 1:
33-
return None
30+
if len(aoh_files) == 0:
31+
raise FileNotFoundError("No AOHs found")
32+
if len(aoh_files) > 1:
33+
raise NotImplementedError("Multi-season AOHs not yet supported")
3434

3535
aoh_tiff_path = aoh_files[0]
3636
aoh_data_path = aoh_tiff_path.with_suffix(".json")
@@ -39,17 +39,17 @@ def process_species(
3939

4040
species_data_files = list(species_data_path.glob(f"**/{taxon_id}*.geojson"))
4141
if len(species_data_files) != 1:
42-
return None
42+
raise RuntimeError(f"We expected one JSON file beside the GeoTIFF, we found {len(species_data_files)}")
4343
species_range = gpd.read_file(species_data_files[0])
4444

4545
# From Dahal et al: "This ensured that we only included points which fell inside
4646
# the boundaries of the selected range maps."
4747
points_gdf = gpd.GeoDataFrame(
48-
species_occurences,
48+
species_occurrences,
4949
geometry=[
5050
Point(lon, lat)
5151
for lon, lat in
52-
zip(species_occurences['decimalLongitude'], species_occurences['decimalLatitude'])
52+
zip(species_occurrences['decimalLongitude'], species_occurrences['decimalLatitude'])
5353
],
5454
crs='EPSG:4326',
5555
)
@@ -73,7 +73,7 @@ def process_species(
7373
# From Dahal et al: "Finally, we excluded species which had fewer than 10 point localities after
7474
# all the filters were applied."
7575
if len(results) < 10:
76-
return None
76+
raise ValueError("Not enough occurrences")
7777

7878
matches = len([x for x in results if x])
7979
point_prevalence = matches / len(results)
@@ -87,7 +87,20 @@ def process_species(
8787
point_prevalence <= model_prevalence
8888
)
8989

90-
def validate_occurences(
90+
def process_species_wrapper(
91+
aohs_path: Path,
92+
species_data_path: Path,
93+
species_occurrences: pd.DataFrame,
94+
) -> tuple[int, int, int, float, float, bool] | None:
95+
# This wrapper exists to make it easier to write unit tests for process_species by having it throw
96+
# unique exceptions for each failure, while still allowing us to use pool.map to invoke it, which won't
97+
# tolerate those.
98+
try:
99+
return process_species(aohs_path, species_data_path, species_occurrences)
100+
except (ValueError, FileNotFoundError, RuntimeError, NotImplementedError):
101+
return None
102+
103+
def validate_occurrences(
91104
gbif_data_path: Path,
92105
aohs_path: Path,
93106
species_data_path: Path,
@@ -98,12 +111,12 @@ def validate_occurences(
98111

99112
# The input is from the points.csv generated by fetch_gbif_data.py, which has the columns:
100113
# iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year
101-
occurences = pd.read_csv(gbif_data_path)
102-
occurences.drop(columns=['gbif_id', 'year'], inplace=True)
103-
occurences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
104-
occurences_per_species = [group for _, group in occurences.groupby('iucn_taxon_id')]
114+
occurrences = pd.read_csv(gbif_data_path)
115+
occurrences.drop(columns=['gbif_id', 'year'], inplace=True)
116+
occurrences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True)
117+
occurrences_per_species = [group for _, group in occurrences.groupby('iucn_taxon_id')]
105118
with Pool(processes=process_count) as pool:
106-
results_per_species = pool.map(partial(process_species, aohs_path, species_data_path), occurences_per_species)
119+
results_per_species = pool.map(partial(process_species, aohs_path, species_data_path), occurrences_per_species)
107120
cleaned_results = [x for x in results_per_species if x is not None]
108121

109122
summary = pd.DataFrame(cleaned_results, columns=[
@@ -157,7 +170,7 @@ def main() -> None:
157170
)
158171
args = parser.parse_args()
159172

160-
validate_occurences(
173+
validate_occurrences(
161174
args.gbif_data_path,
162175
args.aohs_path,
163176
args.species_data_path,

tests/test_occurences.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ def generate_faux_aoh(filename: Path, shape: Polygon | None = None) -> None:
4747
@pytest.mark.parametrize("taxon_id,latitude,longitude,expected",[
4848
(42, 5.0, 5.0, True),
4949
(42, 12.0, 12.0, False),
50-
(40, 5.0, 5.0, False),
5150
])
5251
def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
5352
with tempfile.TemporaryDirectory() as tmpdir:
@@ -61,18 +60,30 @@ def test_simple_match(taxon_id: int, latitude: float, longitude: float, expected
6160
[(taxon_id, latitude, longitude)],
6261
columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
6362
)
64-
import os
65-
print(os.listdir(tmpdir_path))
6663
res = process_species(tmpdir_path, tmpdir_path, df)
6764

68-
assert res is not None
6965
id_no, results, matches, point_prev, model_prev, outlier = res
7066
assert id_no == taxon_id
7167
assert results == 1
7268
assert matches == (1 if expected else 0)
7369
assert model_prev == 1.0
7470
assert outlier == expected
7571

72+
def test_no_aoh_found(taxon_id: int, latitude: float, longitude: float, expected: bool) -> None:
73+
with tempfile.TemporaryDirectory() as tmpdir:
74+
tmpdir_path = Path(tmpdir)
75+
76+
for test_id in [41, 42, 43]:
77+
aoh_path = tmpdir_path / f"{test_id}.tif"
78+
generate_faux_aoh(aoh_path)
79+
80+
df = pd.DataFrame(
81+
[(40, 5.0, 5.0)],
82+
columns=['iucn_taxon_id', 'decimalLatitude', 'decimalLongitude']
83+
)
84+
with pytest.raises(FileNotFoundError):
85+
_ = process_species(tmpdir_path, tmpdir_path, df)
86+
7687
def test_multiple_match() -> None:
7788
with tempfile.TemporaryDirectory() as tmpdir:
7889
tmpdir_path = Path(tmpdir)

0 commit comments

Comments
 (0)