Skip to content

Commit 6fc2c22

Browse files
authored
Merge pull request #29 from quantifyearth/sw-model-validation
Add model validation diagnostics
2 parents 6dd0c73 + be419db commit 6fc2c22

File tree

3 files changed

+138
-13
lines changed

3 files changed

+138
-13
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,9 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
#.idea/
161+
162+
# VS Code
163+
.vscode/
164+
165+
# Claude Code
166+
.claude/

aoh/validation/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ This directory contains code to implement the model base validation proposed by
44

55
This directory contains the following scripts:
66

7-
* `collate_data.py` - Then you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
7+
* `collate_data.py` - When you generate a series of AOH GeoTIFFs, besides each one is a JSON file that contains information required for validation. This script takes a folder containing the AOH output of a run and collates all those JSON files into a single CSV file that can be used for a validation run.
88
* `validate_map_prevalence.py` - This uses the data in the collated CSV to do a model validation as per the Dahal et al paper.
99
* `fetch_gbif_data.py` - This script takes the collated CSV file and attempts to find occurrence data on GBIF that can be used for point validation as per the Dahal et al paper.
1010
* `validate_occurences.py` - This uses the data fetched from GBIF to check the occurrences against a corpus of AOHs.

aoh/validation/validate_map_prevalence.py

Lines changed: 131 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,90 @@
1111
except (ImportError, ValueError):
1212
pymer4 = None
1313

14-
def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
15-
if pymer4 is None:
16-
raise ImportError("pymer4 is required for model validation but not installed. "
17-
"This requires R to be installed on the system.")
14+
def generate_results_summary(aoh_df: pd.DataFrame, outliers: pd.DataFrame) -> str:
    """Render a markdown summary of the validation run.

    Reports overall counts (species with and without AOH, total outliers)
    followed by a per-taxonomic-class breakdown of species counts and
    outlier rates. Returns the markdown document as a single string.
    """
    parts = [
        "# Model Validation Results Summary\n\n",
        "## Summary Statistics\n\n",
        f"- **Total species analyzed**: {len(aoh_df[aoh_df.aoh_total > 0])}\n",
        f"- **Species with no AOH**: {len(aoh_df[aoh_df.aoh_total == 0])}\n",
        f"- **Total outliers detected**: {len(outliers)}\n\n",
        "## Species by Taxonomic Class\n",
    ]

    # Per-class species and outlier counts; outliers are a subset of aoh_df,
    # so every outlier class also appears in species_per_class.
    species_per_class = aoh_df.groupby('class_name').size().to_dict()
    outliers_per_class = outliers.groupby('class_name').size().to_dict()

    for name in sorted(species_per_class):
        n_species = species_per_class[name]
        n_outliers = outliers_per_class.get(name, 0)
        pct = (n_outliers / n_species * 100) if n_species else 0
        parts.append(
            f"- **{name}**: {n_species} species, {n_outliers} outliers ({pct:.1f}%)\n"
        )

    return "".join(parts)
35+
36+
def add_diagnostic_columns(
    klass_df: pd.DataFrame,
    upper_fence: float,
    lower_fence: float
) -> pd.DataFrame:
    """Annotate a per-class AOH frame with outlier diagnostic columns.

    Adds, in place: ``outlier_type`` (normal / over-predicted /
    under-predicted, based on where ``fit_diff`` falls relative to the IQR
    fences), a human-readable ``explanation``, and one
    ``<predictor>_vs_class_mean`` column per predictor giving the percentage
    difference from the class mean. Returns the same DataFrame.
    """
    predictor_cols = ['elevation_rangekm', 'elevation_midkm', 'n_habitats', 'prevalence']
    class_means = klass_df[predictor_cols].mean()

    # Classify each species by which fence (if any) its residual crosses.
    over_mask = klass_df.fit_diff < lower_fence
    under_mask = klass_df.fit_diff > upper_fence
    klass_df['outlier_type'] = 'normal'
    klass_df.loc[over_mask, 'outlier_type'] = 'over-predicted'
    klass_df.loc[under_mask, 'outlier_type'] = 'under-predicted'

    # Human-readable explanation using rounded observed/predicted values.
    observed = klass_df['prevalence'].round(3).astype(str)
    predicted = klass_df['fit'].round(3).astype(str)
    klass_df['explanation'] = 'Within normal range'
    klass_df.loc[klass_df.outlier_type == 'under-predicted', 'explanation'] = (
        'Observed prevalence (' + observed + ') much higher than predicted (' + predicted + ')'
    )
    klass_df.loc[klass_df.outlier_type == 'over-predicted', 'explanation'] = (
        'Observed prevalence (' + observed + ') much lower than predicted (' + predicted + ')'
    )

    # Context columns: percentage difference of each predictor from the
    # class mean, rendered as a string like "12.5%".
    context_columns = {
        'elevation_rangekm': 'elevation_range_vs_class_mean',
        'elevation_midkm': 'elevation_mid_vs_class_mean',
        'n_habitats': 'n_habitats_vs_class_mean',
        'prevalence': 'prevalence_vs_class_mean',
    }
    for source_col, target_col in context_columns.items():
        delta_pct = (klass_df[source_col] - class_means[source_col]) / class_means[source_col] * 100
        klass_df[target_col] = delta_pct.round(1).astype(str) + '%'

    return klass_df
1879

19-
# Ger rid of any where we had no AoH
20-
aoh_df = aoh_df[aoh_df.prevalence > 0]
80+
def extract_model_coefficients(model: "pymer4.models.Lmer", class_name: str) -> pd.DataFrame:
    """Return the model's fixed-effect coefficient table in tidy form.

    The coefficient index is promoted to an explicit ``variable`` column
    (easier downstream pivoting) and ``class_name`` is attached to every row.
    """
    tidy = (
        model.coefs
        .copy()
        .reset_index()
        .rename(columns={'index': 'variable'})
        .assign(class_name=class_name)
    )
    return tidy
86+
87+
def extract_random_effects(model: "pymer4.models.Lmer", class_name: str) -> pd.DataFrame:
    """Extract per-family random intercepts from a fitted mixed model.

    Returns a DataFrame with one row per grouping level, with columns
    ``family_name`` (the random-effect index), ``random_effect`` (the
    intercept value), and ``class_name``.

    Raises:
        ValueError: if no intercept column can be found in the ranef table.
    """
    ranef_df = model.ranef.copy()
    ranef_df['class_name'] = class_name
    ranef_df = ranef_df.reset_index()
    # pymer4 uses 'X.Intercept.' as the column name for random intercepts.
    # Fail with a descriptive error rather than an opaque IndexError if the
    # expected column is missing (e.g. a model fitted without an intercept).
    try:
        intercept_col = next(col for col in ranef_df.columns if 'Intercept' in col)
    except StopIteration as exc:
        raise ValueError(
            f"No random-intercept column found for class {class_name}; "
            f"columns were {list(ranef_df.columns)}"
        ) from exc
    ranef_df = ranef_df.rename(columns={'index': 'family_name', intercept_col: 'random_effect'})
    return ranef_df
95+
96+
def add_predictors_to_aoh_df(aoh_df: pd.DataFrame) -> pd.DataFrame:
97+
"""Calculate and standardize predictor variables."""
2198

2299
aoh_df['elevation_range'] = aoh_df['elevation_upper'] - aoh_df['elevation_lower']
23100
aoh_df['elevation_mid'] = (aoh_df['elevation_upper'] + aoh_df['elevation_lower']) / 2
@@ -35,12 +112,29 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
35112
aoh_df['std_n_habitats'] = (aoh_df.n_habitats - means.n_habitats) \
36113
/ standard_devs.n_habitats
37114

38-
per_class_df = []
115+
return aoh_df
116+
117+
def model_validation(aoh_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
118+
if pymer4 is None:
119+
raise ImportError("pymer4 is required for model validation but not installed. "
120+
"This requires R to be installed on the system.")
121+
122+
# Get rid of any where we had no AoH
123+
aoh_df = aoh_df[aoh_df.prevalence > 0].copy()
124+
125+
# Prepare predictor variables
126+
aoh_df = add_predictors_to_aoh_df(aoh_df)
39127

128+
# Get unique taxonomic classes
40129
klasses = aoh_df.class_name.unique()
41130
if len(klasses) == 0:
42131
raise ValueError("No species classes were found")
43132

133+
# Fit models for each class
134+
per_class_outliers_df = []
135+
per_class_model_coefficients = []
136+
per_class_random_effects = []
137+
44138
for klass in klasses:
45139
klass_df = aoh_df[aoh_df.class_name == klass].copy()
46140
print(f"{klass}:\n\taohs: {len(klass_df)}")
@@ -56,21 +150,46 @@ def model_validation(aoh_df: pd.DataFrame) -> pd.DataFrame:
56150
q1 = klass_df.fit_diff.quantile(q=0.25)
57151
q3 = klass_df.fit_diff.quantile(q=0.75)
58152
iqr = q3 - q1
153+
lower_fence = q1 - (1.5 * iqr)
154+
upper_fence = q3 + (1.5 * iqr)
59155

60-
klass_df['outlier'] = (klass_df.fit_diff > q3 + (1.5 * iqr)) | (klass_df.fit_diff < (q1 - (1.5 * iqr)))
156+
klass_df['outlier'] = (klass_df.fit_diff > upper_fence ) | (klass_df.fit_diff < lower_fence )
157+
klass_df = add_diagnostic_columns(klass_df, upper_fence, lower_fence)
61158
klass_outliers = klass_df[klass_df.outlier == True] # pylint: disable = C0121
62159
print(f"\toutliers: {len(klass_outliers)}")
63-
per_class_df.append(klass_outliers)
160+
per_class_outliers_df.append(klass_outliers)
161+
162+
coef_df = extract_model_coefficients(model, klass)
163+
per_class_model_coefficients.append(coef_df)
164+
165+
ranef_df = extract_random_effects(model, klass)
166+
per_class_random_effects.append(ranef_df)
167+
168+
# Concatenate results
169+
outliers_df = pd.concat(per_class_outliers_df) # type: ignore[arg-type]
170+
model_coefficients_df = pd.concat(per_class_model_coefficients) # type: ignore[arg-type]
171+
random_effects_df = pd.concat(per_class_random_effects) # type: ignore[arg-type]
64172

65-
return pd.concat(per_class_df) # type: ignore[no-any-return]
173+
return outliers_df, model_coefficients_df, random_effects_df
66174

67175
def validate_map_prevalence(
    collated_data_path: Path,
    output_path: Path,
) -> None:
    """Run map-prevalence model validation and write all result files.

    Reads the collated AOH CSV, runs the per-class validation models, writes
    the outlier table to ``output_path``, and drops supporting diagnostics
    (zero-AOH species, model coefficients, random effects, and a markdown
    summary) into the same directory.
    """
    aoh_df = pd.read_csv(collated_data_path)
    outliers, model_coefficients, random_effects = model_validation(aoh_df)
    outliers.to_csv(output_path, index=False)

    # Supporting diagnostic files live alongside the main outlier CSV.
    output_dir = output_path.parent

    no_aoh = aoh_df[aoh_df.aoh_total == 0]
    no_aoh.to_csv(output_dir / "species_with_no_aoh.csv", index=False)

    coefficients_wide = model_coefficients.pivot(
        index='class_name', columns='variable', values='Estimate'
    )
    coefficients_wide.to_csv(output_dir / "model_coefficients.csv", index=True)

    random_effects.to_csv(output_dir / "random_effects.csv", index=False)

    (output_dir / "summary.md").write_text(
        generate_results_summary(aoh_df, outliers), encoding='utf-8'
    )
74193

75194
def main() -> None:
76195
parser = argparse.ArgumentParser(description="Validate map prevalence.")

0 commit comments

Comments
 (0)