1313def process_species (
1414 aohs_path : Path ,
1515 species_data_path : Path ,
16- species_occurences : pd .DataFrame ,
17- ) -> tuple [int , int , int , float , float , bool ] | None :
18-
19- os .environ ["OGR_GEOJSON_MAX_OBJ_SIZE" ] = "0"
16+ species_occurrences : pd .DataFrame ,
17+ ) -> tuple [int , int , int , float , float , bool ]:
2018
21- if len (species_occurences ) == 0 :
22- return None
19+ if len (species_occurrences ) == 0 :
20+ raise ValueError ( "No occurrences" )
2321
24- taxon_ids = species_occurences .iucn_taxon_id .unique ()
22+ taxon_ids = species_occurrences .iucn_taxon_id .unique ()
2523 if len (taxon_ids ) > 1 :
2624 raise ValueError ("Too many taxon IDs" )
2725 taxon_id = taxon_ids [0 ]
2826
2927 aoh_files = list (aohs_path .glob (f"**/{ taxon_id } *.tif" ))
3028 # We here are aborting on those species with no data or those
3129 # with multiple seasons
32- if len (aoh_files ) != 1 :
33- return None
30+ if len (aoh_files ) == 0 :
31+ raise FileNotFoundError ("No AOHs found" )
32+ if len (aoh_files ) > 1 :
33+ raise NotImplementedError ("Multi-season AOHs not yet supported" )
3434
3535 aoh_tiff_path = aoh_files [0 ]
3636 aoh_data_path = aoh_tiff_path .with_suffix (".json" )
@@ -39,17 +39,17 @@ def process_species(
3939
4040 species_data_files = list (species_data_path .glob (f"**/{ taxon_id } *.geojson" ))
4141 if len (species_data_files ) != 1 :
42- return None
42+ raise RuntimeError ( f"We expected one JSON file beside the GeoTIFF, we found { len ( species_data_files ) } " )
4343 species_range = gpd .read_file (species_data_files [0 ])
4444
4545 # From Dahal et al: "This ensured that we only included points which fell inside
4646 # the boundaries of the selected range maps."
4747 points_gdf = gpd .GeoDataFrame (
48- species_occurences ,
48+ species_occurrences ,
4949 geometry = [
5050 Point (lon , lat )
5151 for lon , lat in
52- zip (species_occurences ['decimalLongitude' ], species_occurences ['decimalLatitude' ])
52+ zip (species_occurrences ['decimalLongitude' ], species_occurrences ['decimalLatitude' ])
5353 ],
5454 crs = 'EPSG:4326' ,
5555 )
@@ -73,7 +73,7 @@ def process_species(
7373 # From Dahal et al: "Finally, we excluded species which had fewer than 10 point localities after
7474 # all the filters were applied."
7575 if len (results ) < 10 :
76- return None
76+ raise ValueError ( "Not enough occurrences" )
7777
7878 matches = len ([x for x in results if x ])
7979 point_prevalence = matches / len (results )
@@ -87,7 +87,20 @@ def process_species(
8787 point_prevalence <= model_prevalence
8888 )
8989
90- def validate_occurences (
def process_species_wrapper(
    aohs_path: Path,
    species_data_path: Path,
    species_occurrences: pd.DataFrame,
) -> tuple[int, int, int, float, float, bool] | None:
    """Invoke process_species, converting its known failure exceptions to None.

    process_species raises a distinct exception per failure mode, which keeps
    its unit tests precise, but Pool.map aborts the entire map as soon as a
    worker raises. Parallel callers should therefore go through this wrapper
    and filter out the None results afterwards.

    NOTE(review): validate_occurrences currently maps process_species itself
    over the per-species groups rather than this wrapper — confirm that is
    intentional, otherwise one failing species aborts the whole run.
    """
    expected_failures = (
        ValueError,
        FileNotFoundError,
        RuntimeError,
        NotImplementedError,
    )
    try:
        result = process_species(aohs_path, species_data_path, species_occurrences)
    except expected_failures:
        return None
    return result
102+
103+ def validate_occurrences (
91104 gbif_data_path : Path ,
92105 aohs_path : Path ,
93106 species_data_path : Path ,
@@ -98,12 +111,12 @@ def validate_occurences(
98111
99112 # The input is from the points.csv generated by fetch_gbif_data.py, which has the columns:
100113 # iucn_taxon_id, gbif_id, decimalLatitude, decimalLongitude, assessment year
101- occurences = pd .read_csv (gbif_data_path )
102- occurences .drop (columns = ['gbif_id' , 'year' ], inplace = True )
103- occurences .sort_values (['iucn_taxon_id' , 'decimalLatitude' ], inplace = True )
104- occurences_per_species = [group for _ , group in occurences .groupby ('iucn_taxon_id' )]
114+ occurrences = pd .read_csv (gbif_data_path )
115+ occurrences .drop (columns = ['gbif_id' , 'year' ], inplace = True )
116+ occurrences .sort_values (['iucn_taxon_id' , 'decimalLatitude' ], inplace = True )
117+ occurrences_per_species = [group for _ , group in occurrences .groupby ('iucn_taxon_id' )]
105118 with Pool (processes = process_count ) as pool :
106- results_per_species = pool .map (partial (process_species , aohs_path , species_data_path ), occurences_per_species )
119+ results_per_species = pool .map (partial (process_species , aohs_path , species_data_path ), occurrences_per_species )
107120 cleaned_results = [x for x in results_per_species if x is not None ]
108121
109122 summary = pd .DataFrame (cleaned_results , columns = [
@@ -157,7 +170,7 @@ def main() -> None:
157170 )
158171 args = parser .parse_args ()
159172
160- validate_occurences (
173+ validate_occurrences (
161174 args .gbif_data_path ,
162175 args .aohs_path ,
163176 args .species_data_path ,
0 commit comments