
Commit 1388aba

Add feature importance analysis and unit tests for filter script
1 parent 5934444 commit 1388aba

File tree

- README.md
- Wiki.md
- bin/filter_feature_importance.py
- bin/test_feature_importance.py

4 files changed: +387 -0 lines changed

README.md

Lines changed: 18 additions & 0 deletions
@@ -227,6 +227,24 @@ sbatch -A [account] -p [partition] -c 1 --mem=4g \

**Output:** `results/FILTER/filter.gff3`

### Feature Importance Test

Reviewers often ask for an ablation study of the semi-supervised filter. After a
filter run completes (which produces `FILTER/data.tsv`), launch the automated
leave-one-feature-out test:

```bash
python bin/filter_feature_importance.py FILTER/data.tsv results/busco/full_table.tsv \
    --output-table FILTER/feature_importance.tsv
```

The script reuses `Filter.semiSupRandomForest`, trains a baseline model with all
features, and then retrains while removing each feature individually. The final
out-of-bag error deltas are written to `FILTER/feature_importance.tsv` (and
`FILTER/feature_importance.json`). Use `--features` to restrict the analysis to a
subset of columns or `--ignore` to drop metadata columns that should never be
used as predictors.
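
As a quick way to read the resulting table, here is a minimal sketch (not part of
the pipeline; it only assumes the column names written by
`bin/filter_feature_importance.py`) that ranks features by their OOB-error delta:

```python
import pandas as pd

# Rank features by how much their removal increased the final out-of-bag error.
# A larger positive oob_delta suggests the feature carries useful signal.
imp = pd.read_csv("FILTER/feature_importance.tsv", sep="\t")
ranked = (
    imp[imp["feature_removed"] != "(none)"]  # drop the baseline row
    .sort_values("oob_delta", ascending=False)
)
print(ranked[["feature_removed", "oob_delta", "final_oob_error"]])
```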

## Configuration

Sylvan uses two separate configuration files:

Wiki.md

Lines changed: 39 additions & 0 deletions
@@ -560,3 +560,42 @@ python bin/generate_cluster_from_config.py --config toydata/config/config_anno
```
chmod 775 bin/generate_cluster_from_config.py
```

## Feature Importance Analysis

After finishing the filter phase you will have `FILTER/data.tsv` (the feature
matrix used by `Filter.py`) and a BUSCO run directory such as
`results/busco/eudicots_odb10`. Reviewers often ask for a feature ablation
study, so we provide an automated helper:

```bash
python bin/filter_feature_importance.py FILTER/data.tsv results/busco/<lineage>/full_table.tsv \
    --output-table FILTER/feature_importance.tsv
```

- **What is the BUSCO full table?** Every BUSCO run writes a `full_table.tsv`
  inside its lineage-specific run folder. Each non-Missing BUSCO row lists the
  BUSCO ID, status (Complete/Duplicated/Fragmented), and the transcript/gene ID
  it matched. The feature-importance script reuses this file to count how many
  BUSCOs remain in the “keep” set during each iteration; no new BUSCO analysis
  is required.
- **Outputs**: `FILTER/feature_importance.tsv` (table) plus
  `FILTER/feature_importance.json` (machine-readable). Both include the baseline
  run (all features) and each leave-one-feature-out run, along with final
  out-of-bag (OOB) error, BUSCO counts, and iteration counts.
- **Optional flags**:
  - `--features TPM COVERAGE PFAM ...` restricts the analysis to specific
    columns from `FILTER/data.tsv`.
  - `--ignore TPM_missing singleExon` removes metadata columns so the script
    automatically uses every other feature column.

Workflow summary:

1. Run `Filter.py` as usual to create `FILTER/data.tsv`.
2. Identify the BUSCO `full_table.tsv` path you already used for filter
   monitoring (e.g., `results/busco/eudicots_odb10/full_table.tsv`).
3. Execute the command above. Inspect `FILTER/feature_importance.tsv` to see how
   dropping each feature affects OOB error (positive delta ⇒ feature is
   important); a plotting sketch follows below.
4. Incorporate the results (table/plot) into your manuscript or reviewer
   response.
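
A minimal plotting sketch for steps 3-4 (it assumes `matplotlib` is installed in
your environment; the column names are the ones written by
`bin/filter_feature_importance.py`):

```python
import pandas as pd
import matplotlib.pyplot as plt

# Horizontal bar chart of OOB-error deltas: bars to the right of zero mark
# features whose removal hurt the classifier, i.e. informative evidence tracks.
imp = pd.read_csv("FILTER/feature_importance.tsv", sep="\t")
ablation = imp[imp["feature_removed"] != "(none)"].sort_values("oob_delta")

plt.figure(figsize=(6, 0.4 * len(ablation) + 1))
plt.barh(ablation["feature_removed"], ablation["oob_delta"])
plt.axvline(0, color="black", linewidth=0.8)
plt.xlabel("Change in final OOB error when the feature is removed")
plt.tight_layout()
plt.savefig("FILTER/feature_importance.png", dpi=300)
```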

bin/filter_feature_importance.py

Lines changed: 232 additions & 0 deletions
@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""Leave-one-feature-out analysis for the semi-supervised filter.

This script reuses the `Filter.semiSupRandomForest` training loop and reruns it
multiple times while removing one feature at a time. The difference in the
final out-of-bag (OOB) error provides an intuitive importance score: if dropping
an evidence track increases the OOB error, that feature contributes useful
signal to the classifier.

Example:
    python bin/filter_feature_importance.py FILTER/data.tsv results/busco/run.tsv \
        --output-table FILTER/feature_importance.tsv
"""
from __future__ import annotations

import argparse
import json
import math
import os
from typing import Dict, List

import pandas as pd

from Filter import semiSupRandomForest


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Leave-one-feature-out analysis for the Sylvan filter"
    )
    parser.add_argument(
        "data",
        help="Path to the TSV created by Filter.filter_genes (e.g. FILTER/data.tsv)",
    )
    parser.add_argument(
        "busco",
        help=(
            "Path to the BUSCO table used for monitoring (same input passed to "
            "Filter.py)."
        ),
    )
    parser.add_argument(
        "--features",
        nargs="*",
        default=None,
        help=(
            "Explicit list of feature columns to evaluate. The default uses all "
            "columns except transcript_id/label and anything listed via --ignore."
        ),
    )
    parser.add_argument(
        "--ignore",
        nargs="*",
        default=[],
        help="Columns in the data TSV that should never be used as model features.",
    )
    parser.add_argument(
        "--trees",
        type=int,
        default=100,
        help="Number of trees per random forest run (default: 100)",
    )
    parser.add_argument(
        "--predictors",
        type=int,
        default=6,
        help="max_features hyperparameter for RandomForestClassifier (default: 6)",
    )
    parser.add_argument(
        "--max-iter",
        type=int,
        default=5,
        help=(
            "Maximum number of recycling iterations used by the semi-supervised "
            "training loop (default: 5)"
        ),
    )
    parser.add_argument(
        "--recycle",
        type=float,
        default=0.95,
        help=(
            "Prediction probability threshold required to recycle unlabeled "
            "examples (default: 0.95)"
        ),
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=123,
        help="Random seed passed to RandomForestClassifier (default: 123)",
    )
    parser.add_argument(
        "--output-table",
        default=None,
        help=(
            "Output TSV summarizing the baseline run and each leave-one-feature-out "
            "experiment (default: <data_dir>/feature_importance.tsv)"
        ),
    )
    parser.add_argument(
        "--output-json",
        default=None,
        help=(
            "Optional JSON file capturing the same summary (default: "
            "<data_dir>/feature_importance.json)"
        ),
    )
    return parser.parse_args()


def resolve_feature_list(df: pd.DataFrame, include: List[str] | None, ignore: List[str]) -> List[str]:
    """Return the ordered feature list used for training/ablation."""
    metadata_cols = {"transcript_id", "label"}
    metadata_cols.update(ignore or [])
    default_features = [c for c in df.columns if c not in metadata_cols]

    if include:
        missing = sorted(set(include) - set(default_features))
        if missing:
            raise ValueError(
                f"Requested feature(s) not found in data columns: {', '.join(missing)}"
            )
        return include

    return default_features


def summarize_process(process: Dict[str, List[float]]) -> Dict[str, float]:
    """Extract final iteration statistics from the training process log."""
    def last(seq: List[float]) -> float:
        if not seq:
            return float("nan")
        return seq[-1]

    return {
        "iterations": len(process.get("kept", [])),
        "final_kept": last(process.get("kept", [])),
        "final_discarded": last(process.get("discarded", [])),
        "final_kept_buscos": last(process.get("kept_buscos", [])),
        "final_discarded_buscos": last(process.get("discarded_buscos", [])),
        "final_oob_error": last(process.get("OOB", [])),
    }


def run_filter(
    data: pd.DataFrame,
    features: List[str],
    busco_path: str,
    args: argparse.Namespace,
) -> Dict[str, float]:
    subset_cols = ["transcript_id", "label"] + features
    subset = data.loc[:, subset_cols].copy()
    _, process = semiSupRandomForest(
        subset,
        args.predictors,
        busco_path,
        args.trees,
        seed=args.seed,
        recycle_prob=args.recycle,
        maxiter=args.max_iter,
    )
    return summarize_process(process)


def format_delta(value: float) -> str:
    if value is None or math.isnan(value):
        return "nan"
    return f"{value:+.4f}"


def main() -> None:
    args = parse_args()
    data = pd.read_csv(args.data, sep="\t")

    feature_list = resolve_feature_list(data, args.features, args.ignore)
    if not feature_list:
        raise ValueError("No usable features detected in data TSV.")

    out_dir = os.path.dirname(os.path.abspath(args.data))
    table_path = args.output_table or os.path.join(out_dir, "feature_importance.tsv")
    json_path = args.output_json or os.path.join(out_dir, "feature_importance.json")

    print(f"Running baseline model with {len(feature_list)} features...")
    baseline = run_filter(data, feature_list, args.busco, args)
    baseline_row = {
        "feature_removed": "(none)",
        "num_features": len(feature_list),
        "oob_delta": 0.0,
        **baseline,
    }

    results = [baseline_row]
    for feature in feature_list:
        reduced = [f for f in feature_list if f != feature]
        if not reduced:
            continue
        print(f"Dropping '{feature}' ({len(reduced)} features remaining)...")
        summary = run_filter(data, reduced, args.busco, args)
        summary_row = {
            "feature_removed": feature,
            "num_features": len(reduced),
            "oob_delta": summary["final_oob_error"] - baseline["final_oob_error"],
            **summary,
        }
        results.append(summary_row)
        delta_str = format_delta(summary_row["oob_delta"])
        print(
            f"  -> final OOB error: {summary_row['final_oob_error']:.4f} "
            f"(delta {delta_str})"
        )

    df = pd.DataFrame(results)
    df.to_csv(table_path, sep="\t", index=False)
    print(f"\nSummary written to {table_path}")

    if json_path:
        json_ready = []
        for row in results:
            json_ready.append(
                {
                    k: (None if isinstance(v, float) and math.isnan(v) else v)
                    for k, v in row.items()
                }
            )
        with open(json_path, "w", encoding="utf-8") as fh:
            json.dump({"runs": json_ready}, fh, indent=2)
        print(f"JSON summary written to {json_path}")


if __name__ == "__main__":
    main()
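
A note on the interface this script relies on: it assumes `Filter.semiSupRandomForest`
returns a pair whose second element is a per-iteration `process` dict keyed by
`kept`, `discarded`, `kept_buscos`, `discarded_buscos`, and `OOB`. A minimal
sketch of how `summarize_process` digests such a log (illustrative numbers only,
and assuming `bin/` and its `Filter` dependency are on the Python path):

```python
from filter_feature_importance import summarize_process

# Hypothetical process log from two recycling iterations (values are made up).
process = {
    "kept": [1200, 1150],
    "discarded": [300, 350],
    "kept_buscos": [980, 975],
    "discarded_buscos": [5, 10],
    "OOB": [0.081, 0.074],
}

print(summarize_process(process))
# {'iterations': 2, 'final_kept': 1150, 'final_discarded': 350,
#  'final_kept_buscos': 975, 'final_discarded_buscos': 10,
#  'final_oob_error': 0.074}
```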

bin/test_feature_importance.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""Unit test for the filter_feature_importance.py script.

This test simulates the inputs to filter_feature_importance.py to verify that it
runs and produces outputs in the expected format. It does not check the
statistical validity of the results, but rather the script's execution and
output structure.
"""
import unittest
import tempfile
import shutil
import os
import subprocess
import pandas as pd
import json

class TestFeatureImportance(unittest.TestCase):
    def setUp(self):
        """Set up a temporary directory and dummy input files."""
        self.temp_dir = tempfile.mkdtemp()
        self.data_path = os.path.join(self.temp_dir, "data.tsv")
        self.busco_path = os.path.join(self.temp_dir, "full_table.tsv")
        self.output_table_path = os.path.join(self.temp_dir, "feature_importance.tsv")
        self.output_json_path = os.path.join(self.temp_dir, "feature_importance.json")

        # Create dummy data.tsv
        data = {
            'transcript_id': [f'tx{i}' for i in range(10)],
            'label': ['TE', 'Prot', 'BG', 'TE', 'Prot', 'BG', 'TE', 'Prot', 'BG', 'TE'],
            'feature1': [0.1, 0.9, 0.2, 0.15, 0.85, 0.25, 0.11, 0.92, 0.22, 0.13],
            'feature2': [0.8, 0.2, 0.7, 0.85, 0.25, 0.75, 0.81, 0.22, 0.72, 0.83],
            'feature3': [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
        }
        pd.DataFrame(data).to_csv(self.data_path, sep='\t', index=False)

        # Create dummy full_table.tsv for BUSCO
        busco_data = {
            '# Busco id': [f'busco{i}' for i in range(5)],
            'Status': ['Complete'] * 5,
            'Sequence': ['tx1', 'tx4', 'tx7', 'tx0', 'tx3'],
            'Score': [0.9] * 5,
            'Length': [100] * 5
        }
        with open(self.busco_path, "w") as f:
            f.write("# Some header lines\n")
            pd.DataFrame(busco_data).to_csv(f, sep='\t', index=False)

    def tearDown(self):
        """Clean up the temporary directory."""
        shutil.rmtree(self.temp_dir)

    def test_script_runs_and_creates_output(self):
        """Test if the script runs and creates the expected output files."""
        script_path = os.path.join(os.path.dirname(__file__), 'filter_feature_importance.py')

        # The script we are testing imports 'Filter', which lives in the same
        # directory, so make sure Python can find it.
        env = os.environ.copy()
        env['PYTHONPATH'] = os.path.dirname(__file__) + os.pathsep + env.get('PYTHONPATH', '')

        cmd = [
            'python', script_path,
            self.data_path,
            self.busco_path,
            '--output-table', self.output_table_path,
            '--output-json', self.output_json_path,
            '--max-iter', '2',  # Keep it fast
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, env=env)

        self.assertEqual(
            result.returncode, 0,
            f"Script failed with exit code {result.returncode}\n"
            f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
        )

        # Check if output files were created
        self.assertTrue(os.path.exists(self.output_table_path), "Output table file was not created.")
        self.assertTrue(os.path.exists(self.output_json_path), "Output json file was not created.")

        # Check the content of the TSV output
        output_df = pd.read_csv(self.output_table_path, sep='\t')
        self.assertEqual(len(output_df), 4)  # baseline + 3 features
        expected_columns = [
            'feature_removed', 'num_features', 'oob_delta', 'iterations',
            'final_kept', 'final_discarded', 'final_kept_buscos',
            'final_discarded_buscos', 'final_oob_error'
        ]
        self.assertListEqual(list(output_df.columns), expected_columns)
        self.assertEqual(output_df.iloc[0]['feature_removed'], '(none)')

        # Check the content of the JSON output
        with open(self.output_json_path, 'r') as f:
            json_data = json.load(f)
        self.assertIn('runs', json_data)
        self.assertEqual(len(json_data['runs']), 4)
        self.assertEqual(json_data['runs'][0]['feature_removed'], '(none)')


if __name__ == '__main__':
    unittest.main()
