diff --git a/CHANGES.md b/CHANGES.md index f5170f2..51a3c13 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,7 +8,7 @@ * Performance improvements and simplification to habitat processing. * Store more analysis data from model validation. -* Improve performance of GBIF occurence data fetches. +* Improve performance of GBIF occurrence data fetches. ### Fixed diff --git a/README.md b/README.md index 6c55c23..d971ed1 100644 --- a/README.md +++ b/README.md @@ -219,3 +219,62 @@ options: This will produce a CSV file listing just the AoH maps that fail model validation. **Note:** The validation tools require R to be installed on your system with the `lme4` and `lmerTest` packages. + +## aoh-fetch-gbif-data + +This command fetches occurrence data from [GBIF](https://gbif.org) to do occurrence checking as per Dahal et al. + +```bash +$ aoh-fetch-gbif-data --help +usage: aoh-fetch-gbif-data [-h] --collated_aoh_data COLLATED_DATA_PATH [--gbif_username GBIF_USERNAME] [--gbif_email GBIF_EMAIL] [--gbif_password GBIF_PASSWORD] --taxa TAXA --output_dir OUTPUT_DIR_PATH + +Fetch GBIF records for species for validation. + +options: + -h, --help show this help message and exit + --collated_aoh_data COLLATED_DATA_PATH + CSV containing collated AoH data + --gbif_username GBIF_USERNAME + Username of user's GBIF account. Can also be set in environment. + --gbif_email GBIF_EMAIL + E-mail of user's GBIF account. Can also be set in environment. + --gbif_password GBIF_PASSWORD + Password of user's GBIF account. Can also be set in environment. + --taxa TAXA + --output_dir OUTPUT_DIR_PATH + Destination directory for GBIF data. + +Environment Variables: + GBIF_USERNAME Username of user's GBIF account. + GBIF_EMAIL E-mail of user's GBIF account. + GBIF_PASSWORD Password of user's GBIF account. +``` + +Important notes: + +1. You will need a GBIF account for this. +2. This can take a long time, particularly for birds as there are so many records. +3. 
It can also generate a lot of data, hundreds of gigabytes worth, so ensure you have enough storage space! + +## aoh-validate-occurrences + +This command will run occurrence validation using the GBIF data fetched with the previous command. + +```bash +aoh-validate-occurences --help +usage: aoh-validate-occurences [-h] --gbif_data_path GBIF_DATA_PATH --species_data SPECIES_DATA_PATH --aoh_results AOHS_PATH --output OUTPUT_PATH [-j PROCESSES_COUNT] + +Validate occurrence prevalence. + +options: + -h, --help show this help message and exit + --gbif_data_path GBIF_DATA_PATH + Data containing downloaded GBIF data. + --species_data SPECIES_DATA_PATH + Path of all the species range data. + --aoh_results AOHS_PATH + Path of all the AoH outputs. + --output OUTPUT_PATH CSV of outliers. + -j PROCESSES_COUNT Optional number of concurrent threads to use. +``` + diff --git a/aoh/validation/README.md b/aoh/validation/README.md index 2077381..aff53cf 100644 --- a/aoh/validation/README.md +++ b/aoh/validation/README.md @@ -6,5 +6,5 @@ This directory contains the following scripts: * `collate_data.py` - When you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run. * `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper. -* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurence data on GBIF that can be used for point validation as per the Dahal et al paper. +* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper. * `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a coprus of AOHs. 
diff --git a/aoh/validation/fetch_gbif_data.py b/aoh/validation/fetch_gbif_data.py index 4aec6c2..11e40bf 100644 --- a/aoh/validation/fetch_gbif_data.py +++ b/aoh/validation/fetch_gbif_data.py @@ -137,19 +137,56 @@ def build_point_validation_table( output_csv_path: Path, chunksize: int = 100_000, ) -> None: + # The challenge here is that the GBIF download format is problematic. For instance, the file extension + # is CSV, but the data within is clearly tab separated: https://www.gbif.org/faq?q=csv + # Parsing these files with pandas we get warnings of mixed data types in columns which seems to be it + # failing to deal with some escaping issues. Thus the read file here tries to be somewhat defensive + # but is not perfect. first_chunk = True - for chunk in pd.read_csv(gbif_data_path, sep='\t', chunksize=chunksize, on_bad_lines='skip'): + total_rows = 0 + skipped_rows = 0 + for chunk in pd.read_csv( + gbif_data_path, + sep='\t', # Is a TSV file despite the CSV file extension + chunksize=chunksize, + on_bad_lines='skip', # Skip rows with wrong number of columns + low_memory=False, # Avoid dtype warnings + encoding='utf-8', # GBIF uses UTF-8 + quotechar='"', # Standard quote character + escapechar='\\', # Handle escaped characters + ): + valid_mask = ( + pd.to_numeric(chunk['speciesKey'], errors='coerce').notna() & + pd.to_numeric(chunk['decimalLatitude'], errors='coerce').notna() & + pd.to_numeric(chunk['decimalLongitude'], errors='coerce').notna() & + pd.to_numeric(chunk['year'], errors='coerce').notna() + ) + + skipped_rows += (~valid_mask).sum() + chunk = chunk[valid_mask] + + # Force correct types once the invalid data has been removed + chunk['speciesKey'] = pd.to_numeric(chunk['speciesKey'], errors='coerce') + chunk['decimalLatitude'] = pd.to_numeric(chunk['decimalLatitude'], errors='coerce') + chunk['decimalLongitude'] = pd.to_numeric(chunk['decimalLongitude'], errors='coerce') + chunk['year'] = pd.to_numeric(chunk['year'], errors='coerce') + 
chunk.rename(columns={"speciesKey": "gbif_id"}, inplace=True) updated_data = chunk.merge(map_df, on="gbif_id", how='inner') necessary_columns = updated_data[["iucn_taxon_id", "gbif_id", "decimalLatitude", "decimalLongitude", "year"]] + necessary_columns.to_csv( output_csv_path, mode='w' if first_chunk else 'a', header=first_chunk, index=False ) + total_rows += len(necessary_columns) first_chunk = False + print(f"Wrote {total_rows} rows to {output_csv_path}") + print(f"Skipped {skipped_rows} rows due to invalid/missing data") + def fetch_gbif_data( collated_data_path: Path, taxa: str, diff --git a/aoh/validation/validate_occurences.py b/aoh/validation/validate_occurences.py index b34133c..ff89d15 100644 --- a/aoh/validation/validate_occurences.py +++ b/aoh/validation/validate_occurences.py @@ -36,7 +36,7 @@ def process_species( raise ValueError("Too many taxon IDs") taxon_id = taxon_ids[0] - aoh_files = list(aohs_path.glob(f"**/{taxon_id}*.tif")) + aoh_files = list(aohs_path.glob(f"**/{taxon_id}_*.tif")) # We here are aborting on those species with no data or those # with multiple seasons if len(aoh_files) == 0: @@ -49,9 +49,11 @@ def process_species( with open(aoh_data_path, 'r', encoding='utf-8') as f: aoh_data = json.load(f) - species_data_files = list(species_data_path.glob(f"**/{taxon_id}*.geojson")) + species_data_files = list(species_data_path.glob(f"**/{taxon_id}_*.geojson")) if len(species_data_files) != 1: - raise RuntimeError(f"We expected one JSON file beside the GeoTIFF, we found {len(species_data_files)}") + raise RuntimeError( + f"We expected one GeoJSON file beside the GeoTIFF, we found {len(species_data_files)} for {taxon_id}" + ) species_range = gpd.read_file(species_data_files[0]) # From Dahal et al: "This ensured that we only included points which fell inside @@ -148,14 +150,18 @@ def validate_occurrences( occurrences.sort_values(['iucn_taxon_id', 'decimalLatitude'], inplace=True) occurrences_per_species = [group for _, group in 
occurrences.groupby('iucn_taxon_id')] with Pool(processes=process_count) as pool: - results_per_species = pool.map(partial(process_species, aohs_path, species_data_path), occurrences_per_species) + results_per_species = pool.map(partial( + process_species_wrapper, + aohs_path, + species_data_path + ), occurrences_per_species) cleaned_results = [x for x in results_per_species if x is not None] summary = pd.DataFrame(cleaned_results) summary.to_csv(output_path, index=False) def main() -> None: - parser = argparse.ArgumentParser(description="Validate map prevalence.") + parser = argparse.ArgumentParser(description="Validate occurrence prevelance.") parser.add_argument( '--gbif_data_path', type=Path, diff --git a/tests/test_occurences.py b/tests/test_occurences.py index 7e5a0f8..672eae2 100644 --- a/tests/test_occurences.py +++ b/tests/test_occurences.py @@ -98,7 +98,7 @@ def test_simple_match_in_out_range( tmpdir_path = Path(tmpdir) for test_id in [41, 42, 43]: - aoh_path = tmpdir_path / f"{test_id}.tif" + aoh_path = tmpdir_path / f"{test_id}_RESIDENT.tif" generate_faux_aoh(aoh_path) occurences = generate_occurrence_cluster(latitude, longitude, 20, 2.0) @@ -116,7 +116,7 @@ def test_simple_match_in_out_range( (42, 0.0, 0.0, 1.0, True, False), # all in AoH (42, 0.0, 20.0, None, False, None), # all out of range ]) -def test_model_prevalence_of_one( +def test_occurrence_prevalence_of_one( taxon_id: int, latitude: float, longitude: float, @@ -128,7 +128,7 @@ def test_model_prevalence_of_one( tmpdir_path = Path(tmpdir) for test_id in [41, 42, 43]: - aoh_path = tmpdir_path / f"{test_id}.tif" + aoh_path = tmpdir_path / f"{test_id}_RESIDENT.tif" generate_faux_aoh(aoh_path, aoh_radius=5.0, range_radius=5.0) occurences = generate_occurrence_cluster(latitude, longitude, 20, 2.0) @@ -150,7 +150,7 @@ def test_no_aoh_found() -> None: tmpdir_path = Path(tmpdir) for test_id in [41, 42, 43]: - aoh_path = tmpdir_path / f"{test_id}.tif" + aoh_path = tmpdir_path / 
f"{test_id}_RESIDENT.tif" generate_faux_aoh(aoh_path) df = pd.DataFrame(