Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# VS Code
.vscode/

# Claude Code
.claude/
2 changes: 1 addition & 1 deletion aoh/validation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ This directory contains code to implement the model base validation proposed by

This directory contains the following scripts:

* `collate_data.py` - Then you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
* `collate_data.py` - When you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a rite of passage for working on any code base I've touched - I'm sorry :)

* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper.
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper.
* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AOHs.
143 changes: 131 additions & 12 deletions aoh/validation/validate_map_prevalence.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,90 @@
except (ImportError, ValueError):
pymer4 = None

def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
if pymer4 is None:
raise ImportError("pymer4 is required for model validation but not installed. "
"This requires R to be installed on the system.")
def generate_results_summary(aoh_df: pd.DataFrame, outliers: pd.DataFrame) -> str:
    """Render a Markdown report summarising a model validation run.

    Args:
        aoh_df: Collated per-species AOH data; must carry `aoh_total` and
            `class_name` columns.
        outliers: Subset of species flagged as outliers by the validation.

    Returns:
        Markdown text with overall counts plus a per-taxonomic-class breakdown.
    """
    species_with_aoh = int((aoh_df.aoh_total > 0).sum())
    species_without_aoh = int((aoh_df.aoh_total == 0).sum())

    report_lines = [
        "# Model Validation Results Summary",
        "",
        "## Summary Statistics",
        "",
        f"- **Total species analyzed**: {species_with_aoh}",
        f"- **Species with no AOH**: {species_without_aoh}",
        f"- **Total outliers detected**: {len(outliers)}",
        "",
        "## Species by Taxonomic Class",
    ]

    species_per_class = aoh_df.groupby('class_name').size().to_dict()
    outliers_per_class = outliers.groupby('class_name').size().to_dict()

    for name in sorted(species_per_class):
        species_total = species_per_class[name]
        flagged = outliers_per_class.get(name, 0)
        share = (flagged / species_total * 100) if species_total > 0 else 0
        report_lines.append(
            f"- **{name}**: {species_total} species, {flagged} outliers ({share:.1f}%)"
        )

    # Trailing newline matches the original per-line concatenation.
    return "\n".join(report_lines) + "\n"

def add_diagnostic_columns(
klass_df: pd.DataFrame,
upper_fence: float,
lower_fence: float
) -> pd.DataFrame:
# Calculate class means for comparison
klass_means = klass_df[['elevation_rangekm', 'elevation_midkm', 'n_habitats', 'prevalence']].mean()

# Outlier flags and type
klass_df['outlier_type'] = 'normal'
klass_df.loc[klass_df.fit_diff < lower_fence, 'outlier_type'] = 'over-predicted'
klass_df.loc[klass_df.fit_diff > upper_fence, 'outlier_type'] = 'under-predicted'

# Human-readable explanation
klass_df['explanation'] = 'Within normal range'
klass_df.loc[klass_df.outlier_type == 'under-predicted', 'explanation'] = (
'Observed prevalence (' + klass_df['prevalence'].round(3).astype(str) +
') much higher than predicted (' + klass_df['fit'].round(3).astype(str) + ')'
)
klass_df.loc[klass_df.outlier_type == 'over-predicted', 'explanation'] = (
'Observed prevalence (' + klass_df['prevalence'].round(3).astype(str) +
') much lower than predicted (' + klass_df['fit'].round(3).astype(str) + ')'
)

# Context comparison - percentage difference from class mean
klass_df['elevation_range_vs_class_mean'] = (
((klass_df['elevation_rangekm'] - klass_means['elevation_rangekm']) /
klass_means['elevation_rangekm'] * 100).round(1).astype(str) + '%'
)
klass_df['elevation_mid_vs_class_mean'] = (
((klass_df['elevation_midkm'] - klass_means['elevation_midkm']) /
klass_means['elevation_midkm'] * 100).round(1).astype(str) + '%'
)
klass_df['n_habitats_vs_class_mean'] = (
((klass_df['n_habitats'] - klass_means['n_habitats']) /
klass_means['n_habitats'] * 100).round(1).astype(str) + '%'
)
klass_df['prevalence_vs_class_mean'] = (
((klass_df['prevalence'] - klass_means['prevalence']) /
klass_means['prevalence'] * 100).round(1).astype(str) + '%'
)

return klass_df

# Ger rid of any where we had no AoH
aoh_df = aoh_df[aoh_df.prevalence > 0]
def extract_model_coefficients(model: "pymer4.models.Lmer", class_name: str) -> pd.DataFrame:
    """Pull the fixed-effect coefficient table out of a fitted model.

    The term names live in the index of `model.coefs`; they are surfaced as an
    explicit 'variable' column so downstream code can pivot on them, and every
    row is tagged with the taxonomic class the model was fitted for.
    """
    return (
        model.coefs
        .reset_index()
        .rename(columns={'index': 'variable'})
        .assign(class_name=class_name)
    )

def extract_random_effects(model: "pymer4.models.Lmer", class_name: str) -> pd.DataFrame:
    """Extract the per-family random intercepts from a fitted pymer4 model.

    Args:
        model: Fitted Lmer model; its `ranef` table is indexed by the grouping
            factor (renamed here to `family_name`).
        class_name: Taxonomic class label attached to every row.

    Returns:
        DataFrame with `family_name`, `random_effect` and `class_name` columns.

    Raises:
        ValueError: If no random-intercept column can be found in `model.ranef`.
    """
    ranef_df = model.ranef.copy()
    ranef_df['class_name'] = class_name
    ranef_df = ranef_df.reset_index()
    # pymer4 uses 'X.Intercept.' as the column name for random intercepts.
    # Fail with a descriptive error rather than a bare IndexError if the
    # expected column is missing.
    intercept_cols = [col for col in ranef_df.columns if 'Intercept' in col]
    if not intercept_cols:
        raise ValueError(
            f"No random intercept column found in random effects for {class_name}: "
            f"{list(ranef_df.columns)}"
        )
    ranef_df = ranef_df.rename(columns={'index': 'family_name', intercept_cols[0]: 'random_effect'})
    return ranef_df

def add_predictors_to_aoh_df(aoh_df: pd.DataFrame) -> pd.DataFrame:
"""Calculate and standardize predictor variables."""

aoh_df['elevation_range'] = aoh_df['elevation_upper'] - aoh_df['elevation_lower']
aoh_df['elevation_mid'] = (aoh_df['elevation_upper'] + aoh_df['elevation_lower']) / 2
Expand All @@ -35,12 +112,29 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
aoh_df['std_n_habitats'] = (aoh_df.n_habitats - means.n_habitats) \
/ standard_devs.n_habitats

per_class_df = []
return aoh_df

def model_validation(aoh_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
if pymer4 is None:
raise ImportError("pymer4 is required for model validation but not installed. "
"This requires R to be installed on the system.")

# Get rid of any where we had no AoH
aoh_df = aoh_df[aoh_df.prevalence > 0].copy()

# Prepare predictor variables
aoh_df = add_predictors_to_aoh_df(aoh_df)

# Get unique taxonomic classes
klasses = aoh_df.class_name.unique()
if len(klasses) == 0:
raise ValueError("No species classes were found")

# Fit models for each class
per_class_outliers_df = []
per_class_model_coefficients = []
per_class_random_effects = []

for klass in klasses:
klass_df = aoh_df[aoh_df.class_name == klass].copy()
print(f"{klass}:\n\taohs: {len(klass_df)}")
Expand All @@ -56,21 +150,46 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
q1 = klass_df.fit_diff.quantile(q=0.25)
q3 = klass_df.fit_diff.quantile(q=0.75)
iqr = q3 - q1
lower_fence = q1 - (1.5 * iqr)
upper_fence = q3 + (1.5 * iqr)

klass_df['outlier'] = (klass_df.fit_diff > q3 + (1.5 * iqr)) | (klass_df.fit_diff < (q1 - (1.5 * iqr)))
klass_df['outlier'] = (klass_df.fit_diff > upper_fence ) | (klass_df.fit_diff < lower_fence )
klass_df = add_diagnostic_columns(klass_df, upper_fence, lower_fence)
klass_outliers = klass_df[klass_df.outlier == True] # pylint: disable = C0121
print(f"\toutliers: {len(klass_outliers)}")
per_class_df.append(klass_outliers)
per_class_outliers_df.append(klass_outliers)

coef_df = extract_model_coefficients(model, klass)
per_class_model_coefficients.append(coef_df)

ranef_df = extract_random_effects(model, klass)
per_class_random_effects.append(ranef_df)

# Concatenate results
outliers_df = pd.concat(per_class_outliers_df) # type: ignore[arg-type]
model_coefficients_df = pd.concat(per_class_model_coefficients) # type: ignore[arg-type]
random_effects_df = pd.concat(per_class_random_effects) # type: ignore[arg-type]

return pd.concat(per_class_df) # type: ignore[no-any-return]
return outliers_df, model_coefficients_df, random_effects_df

def validate_map_prevalence(
    collated_data_path: Path,
    output_path: Path,
) -> None:
    """Run the map-prevalence validation and write its diagnostic outputs.

    Reads the collated per-species CSV, writes the detected outliers to
    `output_path`, and drops the supporting diagnostics (species with no AOH,
    model coefficients, random effects, Markdown summary) into the same
    directory as `output_path`.
    """
    aoh_df = pd.read_csv(collated_data_path)
    outliers, model_coefficients, random_effects = model_validation(aoh_df)
    outliers.to_csv(output_path, index=False)

    # Model diagnostics live alongside the main outliers CSV.
    diagnostics_dir = output_path.parent

    no_aoh_species = aoh_df[aoh_df.aoh_total == 0]
    no_aoh_species.to_csv(diagnostics_dir / "species_with_no_aoh.csv", index=False)

    # One row per class, one column per model term.
    coefficients_table = model_coefficients.pivot(
        index='class_name', columns='variable', values='Estimate'
    )
    coefficients_table.to_csv(diagnostics_dir / "model_coefficients.csv", index=True)

    random_effects.to_csv(diagnostics_dir / "random_effects.csv", index=False)

    summary_content = generate_results_summary(aoh_df, outliers)
    (diagnostics_dir / "summary.md").write_text(summary_content, encoding='utf-8')

def main() -> None:
parser = argparse.ArgumentParser(description="Validate map prevalence.")
Expand Down