diff --git a/Dockerfile b/Dockerfile
index 5d2c48e..6eb9e3f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,69 +1,35 @@
-# BioAnalyzer Backend Dockerfile
 FROM python:3.11-slim
 
 WORKDIR /app
-
-# FIX: Make Python recognize the application as a package
 ENV PYTHONPATH="/app:/app/app"
 
-# Install system dependencies
 RUN apt-get update && apt-get install -y \
-    gcc \
-    g++ \
-    curl \
-    git \
+    gcc g++ curl git \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy pyproject.toml and README.md first for better caching
 COPY pyproject.toml README.md ./
 
-# Upgrade pip and setuptools first
 RUN pip install --upgrade pip setuptools wheel build
 
-# ------------------------------------------------------------
-# Step 1: Install PyTorch CPU versions (fixed +cpu issue)
-# Note: PyTorch CPU versions require special index URL, so we install them separately
-# before installing the package from pyproject.toml
-# ------------------------------------------------------------
-RUN pip install --no-cache-dir --default-timeout=600 --retries=10 \
-    --extra-index-url https://download.pytorch.org/whl/cpu \
-    torch==2.1.0+cpu \
-    torchvision==0.16.0+cpu \
-    torchaudio==2.1.0+cpu
+# Install PyTorch CPU wheels first; the +cpu builds live on PyTorch's own index
+RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu \
+    torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu
 
-# ------------------------------------------------------------
-# Step 2: Copy application code
-# ------------------------------------------------------------
 COPY . .
 
-# ------------------------------------------------------------
-# Step 3: Install the package from pyproject.toml
-# This installs the package and all its dependencies from pyproject.toml
-# PyTorch is already installed above, so pip will skip it
-# Installing in editable mode (-e) ensures entry points are properly installed
-# ------------------------------------------------------------
-RUN pip install --no-cache-dir --default-timeout=300 --retries=5 -e .
+# Install the package and its dependencies from pyproject.toml (PyTorch is already satisfied)
+RUN pip install --no-cache-dir -e .
 
-# ------------------------------------------------------------
-# Step 4: Install testing dependencies (optional, for development)
-# ------------------------------------------------------------
-RUN pip install --no-cache-dir pytest>=7.4.0 pytest-cov>=4.1.0
+# Defensively install analysis dependencies (normally already pulled in above)
+RUN pip install --no-cache-dir pandas scikit-learn matplotlib seaborn
 
-# Create necessary directories
 RUN mkdir -p cache logs results
 
-# Make CLI executable
-RUN chmod +x cli.py
+RUN chmod +x cli.py || true
+RUN chmod +x scripts/*.py || true
 
-# Expose port
 EXPOSE 8000
 
-# Health check
-HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
-    CMD curl -f http://localhost:8000/health || exit 1
-
-# Set PYTHONPATH for app module imports (fixed nested /app/app issue)
-# ENV PYTHONPATH=/app:/app/app
+HEALTHCHECK CMD curl -f http://localhost:8000/health || exit 1
 
-# Default command (can be overridden)
 CMD ["python", "main.py", "--host", "0.0.0.0", "--port", "8000"]
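A quick way to confirm the +cpu wheels actually won out over any CUDA builds is a check inside the built image. A minimal sketch, assuming the image builds successfully; the helper filename and image tag are illustrative, not part of this diff:

    # verify_torch_cpu.py -- hypothetical post-build sanity check
    import torch

    assert torch.__version__.endswith("+cpu"), torch.__version__  # +cpu wheel installed
    assert not torch.cuda.is_available()  # slim CPU image: no CUDA runtime expected
    print(f"CPU-only PyTorch {torch.__version__} OK")

For example: docker run --rm <image> python verify_torch_cpu.py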
" + "Install with: pip install google-generativeai" + ) possible_env_paths = [ Path(__file__).parents[1] / ".env", # Original location diff --git a/config/requirements.txt b/config/requirements.txt index 7ab6b0a..cbbc783 100644 --- a/config/requirements.txt +++ b/config/requirements.txt @@ -10,8 +10,8 @@ torchaudio>=2.1.0+cpu numpy>=1.26.0 pandas>=2.1.1 scikit-learn>=1.3.0 -matplotlib>=3.7 -seaborn>=0.12 +matplotlib>=3.7.0 +seaborn>=0.12.0 biopython>=1.81 pytz>=2023.3 @@ -29,7 +29,7 @@ paper-qa>=5.0.0 # --- Vector Database --- qdrant-client>=1.7.0 -# --- Web Framework & API (FastAPI/Uvicorn) --- +# --- Web Framework & API --- fastapi>=0.104.0 uvicorn[standard]>=0.23.2 starlette>=0.31.1 @@ -47,19 +47,15 @@ wsproto>=1.0.0 h11>=0.12.0 httptools>=0.3.0 -# --- File Processing (Excel/Env) --- +# --- File Processing --- openpyxl>=3.1.0 xlrd>=2.0.1 python-dotenv>=1.0.0 PyYAML>=5.4.1 aiofiles>=0.7.0 -# --- Utilities & System --- -tqdm>=4.65.0` +# --- Utilities --- +tqdm>=4.65.0 psutil>=5.9.0 click>=8.0.1 watchfiles[watchdog]>=1.0.0 - -# --- Development (Uncomment to install) --- -# pytest>=7.4.0 -# pytest-cov>=4.1.0 \ No newline at end of file diff --git a/confusion_matrix_analysis.py b/confusion_matrix_analysis.py index 0c55795..1e80cb4 100644 --- a/confusion_matrix_analysis.py +++ b/confusion_matrix_analysis.py @@ -1,288 +1,103 @@ #!/usr/bin/env python3 """ -Confusion Matrix and Matthew's Correlation Coefficient Analysis - -This script compares BioAnalyzer predictions (analysis_results.csv) with -curator feedback (feedback.csv) to generate confusion matrices and calculate -MCC for each variable. +Formal validation of BioAnalyzer predictions using +confusion matrices and Matthews Correlation Coefficient (MCC). """ -import pandas as pd +from pathlib import Path +import json import numpy as np +import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import confusion_matrix, matthews_corrcoef -from pathlib import Path -import json -from typing import Dict, Tuple, List -# Set style for better-looking plots +CLASSES = ["ABSENT", "PARTIALLY_PRESENT", "PRESENT"] + sns.set_style("whitegrid") -plt.rcParams['figure.figsize'] = (12, 8) +plt.rcParams["figure.figsize"] = (10, 8) -def load_data(analysis_file: str, feedback_file: str) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Load and prepare the analysis results and feedback data.""" - print("Loading data files...") - analysis_df = pd.read_csv(analysis_file) - feedback_df = pd.read_csv(feedback_file) - - print(f"Analysis results: {len(analysis_df)} records") - print(f"Feedback data: {len(feedback_df)} records") - - return analysis_df, feedback_df +def load_data(predictions: Path, feedback: Path): + return pd.read_csv(predictions), pd.read_csv(feedback) -def merge_data(analysis_df: pd.DataFrame, feedback_df: pd.DataFrame) -> pd.DataFrame: - """Merge analysis results with feedback data on PMID.""" - print("\nMerging data on PMID...") - - # Merge on PMID, keeping only records that exist in both - merged = analysis_df.merge( - feedback_df, - on='PMID', - suffixes=('_predicted', '_actual'), - how='inner' - ) - - print(f"Matched records: {len(merged)}") - - if len(merged) == 0: - raise ValueError("No matching PMIDs found between the two files!") - +def merge_on_pmid(pred_df, fb_df): + merged = pred_df.merge(fb_df, on="PMID", suffixes=("_predicted", "_actual")) + if merged.empty: + raise ValueError("No overlapping PMIDs found.") return merged -def calculate_binary_mcc(y_true: np.ndarray, y_pred: np.ndarray) -> float: - """ - 
diff --git a/confusion_matrix_analysis.py b/confusion_matrix_analysis.py
index 0c55795..1e80cb4 100644
--- a/confusion_matrix_analysis.py
+++ b/confusion_matrix_analysis.py
@@ -1,288 +1,103 @@
 #!/usr/bin/env python3
 """
-Confusion Matrix and Matthew's Correlation Coefficient Analysis
-
-This script compares BioAnalyzer predictions (analysis_results.csv) with
-curator feedback (feedback.csv) to generate confusion matrices and calculate
-MCC for each variable.
+Formal validation of BioAnalyzer predictions using
+confusion matrices and the Matthews correlation coefficient (MCC).
 """
 
-import pandas as pd
+from pathlib import Path
+import json
+
 import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.metrics import confusion_matrix, matthews_corrcoef
-from pathlib import Path
-import json
-from typing import Dict, Tuple, List
 
-# Set style for better-looking plots
+CLASSES = ["ABSENT", "PARTIALLY_PRESENT", "PRESENT"]
+
 sns.set_style("whitegrid")
-plt.rcParams['figure.figsize'] = (12, 8)
+plt.rcParams["figure.figsize"] = (10, 8)
 
 
-def load_data(analysis_file: str, feedback_file: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Load and prepare the analysis results and feedback data."""
-    print("Loading data files...")
-    analysis_df = pd.read_csv(analysis_file)
-    feedback_df = pd.read_csv(feedback_file)
-    
-    print(f"Analysis results: {len(analysis_df)} records")
-    print(f"Feedback data: {len(feedback_df)} records")
-    
-    return analysis_df, feedback_df
+def load_data(predictions: Path, feedback: Path):
+    return pd.read_csv(predictions), pd.read_csv(feedback)
 
 
-def merge_data(analysis_df: pd.DataFrame, feedback_df: pd.DataFrame) -> pd.DataFrame:
-    """Merge analysis results with feedback data on PMID."""
-    print("\nMerging data on PMID...")
-    
-    # Merge on PMID, keeping only records that exist in both
-    merged = analysis_df.merge(
-        feedback_df,
-        on='PMID',
-        suffixes=('_predicted', '_actual'),
-        how='inner'
-    )
-    
-    print(f"Matched records: {len(merged)}")
-    
-    if len(merged) == 0:
-        raise ValueError("No matching PMIDs found between the two files!")
-    
+def merge_on_pmid(pred_df, fb_df):
+    merged = pred_df.merge(fb_df, on="PMID", suffixes=("_predicted", "_actual"))
+    if merged.empty:
+        raise ValueError("No overlapping PMIDs found.")
     return merged
 
 
-def calculate_binary_mcc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
-    """
-    Calculate MCC for binary classification (PRESENT vs not PRESENT).
-    Treats PARTIALLY_PRESENT as a separate category initially.
-    """
-    # Convert to binary: PRESENT = 1, everything else = 0
-    y_true_binary = (y_true == 'PRESENT').astype(int)
-    y_pred_binary = (y_pred == 'PRESENT').astype(int)
-    
-    return matthews_corrcoef(y_true_binary, y_pred_binary)
+def binary_mcc(y_true, y_pred):
+    y_true_bin = (y_true == "PRESENT").astype(int)  # PRESENT vs. everything else
+    y_pred_bin = (y_pred == "PRESENT").astype(int)
+    return matthews_corrcoef(y_true_bin, y_pred_bin)
 
 
-def calculate_multiclass_mcc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
-    """
-    Calculate MCC for multi-class classification.
-    Maps PRESENT=2, PARTIALLY_PRESENT=1, ABSENT=0
-    """
-    status_map = {'ABSENT': 0, 'PARTIALLY_PRESENT': 1, 'PRESENT': 2}
-    
-    y_true_mapped = np.array([status_map.get(s, 0) for s in y_true])
-    y_pred_mapped = np.array([status_map.get(s, 0) for s in y_pred])
-    
-    return matthews_corrcoef(y_true_mapped, y_pred_mapped)
+def multiclass_mcc(y_true, y_pred):
+    return matthews_corrcoef(y_true, y_pred)  # sklearn accepts string labels directly
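The refactor drops the old integer status_map because sklearn's matthews_corrcoef accepts string labels directly. A small illustration with invented labels:

    import pandas as pd
    from sklearn.metrics import matthews_corrcoef

    y_true = pd.Series(["PRESENT", "ABSENT", "PARTIALLY_PRESENT", "PRESENT"])
    y_pred = pd.Series(["PRESENT", "ABSENT", "PRESENT", "ABSENT"])

    # Multi-class MCC straight from the string labels
    print(matthews_corrcoef(y_true, y_pred))

    # Binary MCC: PRESENT vs. everything else, as in binary_mcc()
    print(matthews_corrcoef((y_true == "PRESENT").astype(int),
                            (y_pred == "PRESENT").astype(int)))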
-def create_confusion_matrix_data(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, List[str]]:
-    """Create confusion matrix with all three classes."""
-    classes = ['ABSENT', 'PARTIALLY_PRESENT', 'PRESENT']
-    
-    # Filter out any NaN values
-    mask = ~(pd.isna(y_true) | pd.isna(y_pred))
-    y_true_clean = y_true[mask]
-    y_pred_clean = y_pred[mask]
-    
-    cm = confusion_matrix(
-        y_true_clean,
-        y_pred_clean,
-        labels=classes
-    )
-    
-    return cm, classes
-
+def analyze_variable(df, variable):
+    yt = df[f"{variable}_actual"].dropna()
+    yp = df[f"{variable}_predicted"].dropna()
 
-def plot_confusion_matrix(cm: np.ndarray, classes: List[str], variable_name: str,
-                          save_path: str = None) -> None:
-    """Plot a confusion matrix with annotations."""
-    plt.figure(figsize=(10, 8))
-    
-    # Normalize confusion matrix to show percentages
-    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
-    cm_normalized = np.nan_to_num(cm_normalized)
-    
-    # Create heatmap
-    sns.heatmap(
-        cm_normalized,
-        annot=True,
-        fmt='.2%',
-        cmap='Blues',
-        xticklabels=classes,
-        yticklabels=classes,
-        cbar_kws={'label': 'Normalized Frequency'},
-        annot_kws={'size': 12}
-    )
-    
-    plt.title(f'Confusion Matrix: {variable_name}\n(Normalized)', fontsize=16, fontweight='bold')
-    plt.ylabel('Actual (Curator)', fontsize=12)
-    plt.xlabel('Predicted (BioAnalyzer)', fontsize=12)
-    plt.tight_layout()
-    
-    if save_path:
-        plt.savefig(save_path, dpi=300, bbox_inches='tight')
-        print(f"  Saved confusion matrix plot: {save_path}")
-    
-    plt.close()
+    # Keep only rows labeled by both curator and model
+    mask = yt.index.intersection(yp.index)
+    yt, yp = yt.loc[mask], yp.loc[mask]
 
+    cm = confusion_matrix(yt, yp, labels=CLASSES)
 
-def analyze_variable(merged_df: pd.DataFrame, variable: str) -> Dict:
-    """Analyze a single variable and return metrics."""
-    print(f"\nAnalyzing variable: {variable}")
-    
-    pred_col = f"{variable}_predicted"
-    actual_col = f"{variable}_actual"
-    
-    if pred_col not in merged_df.columns or actual_col not in merged_df.columns:
-        print(f"  Warning: Columns not found for {variable}")
-        return None
-    
-    y_pred = merged_df[pred_col].values
-    y_actual = merged_df[actual_col].values
-    
-    # Remove NaN values
-    mask = ~(pd.isna(y_actual) | pd.isna(y_pred))
-    y_pred_clean = y_pred[mask]
-    y_actual_clean = y_actual[mask]
-    
-    if len(y_pred_clean) == 0:
-        print(f"  Warning: No valid data for {variable}")
-        return None
-    
-    # Create confusion matrix
-    cm, classes = create_confusion_matrix_data(y_actual_clean, y_pred_clean)
-    
-    # Calculate MCCs
-    mcc_binary = calculate_binary_mcc(y_actual_clean, y_pred_clean)
-    mcc_multiclass = calculate_multiclass_mcc(y_actual_clean, y_pred_clean)
-    
-    # Calculate accuracy
-    accuracy = np.sum(y_actual_clean == y_pred_clean) / len(y_actual_clean)
-    
-    # Calculate per-class metrics
-    results = {
-        'variable': variable,
-        'confusion_matrix': cm.tolist(),
-        'classes': classes,
-        'mcc_binary': float(mcc_binary),
-        'mcc_multiclass': float(mcc_multiclass),
-        'accuracy': float(accuracy),
-        'n_samples': int(len(y_pred_clean)),
-        'class_distribution_actual': {
-            cls: int(np.sum(y_actual_clean == cls)) for cls in classes
-        },
-        'class_distribution_predicted': {
-            cls: int(np.sum(y_pred_clean == cls)) for cls in classes
-        }
+    return {
+        "variable": variable,
+        "n": len(yt),
+        "accuracy": float((yt == yp).mean()),
+        "mcc_binary": float(binary_mcc(yt, yp)),
+        "mcc_multiclass": float(multiclass_mcc(yt, yp)),
+        "confusion_matrix": cm.tolist(),
     }
-    
-    print(f"  Samples: {results['n_samples']}")
-    print(f"  Accuracy: {results['accuracy']:.3f}")
-    print(f"  MCC (Binary): {results['mcc_binary']:.3f}")
-    print(f"  MCC (Multi-class): {results['mcc_multiclass']:.3f}")
-    
-    return results
 
 
-def generate_summary_report(all_results: List[Dict], output_file: str) -> None:
-    """Generate a summary report with all metrics."""
-    print("\n" + "="*80)
-    print("SUMMARY REPORT")
-    print("="*80)
-    
-    # Create summary DataFrame
-    summary_data = []
-    for result in all_results:
-        if result is None:
-            continue
-        summary_data.append({
-            'Variable': result['variable'],
-            'N Samples': result['n_samples'],
-            'Accuracy': f"{result['accuracy']:.3f}",
-            'MCC (Binary)': f"{result['mcc_binary']:.3f}",
-            'MCC (Multi-class)': f"{result['mcc_multiclass']:.3f}"
-        })
-    
-    summary_df = pd.DataFrame(summary_data)
-    print("\n" + summary_df.to_string(index=False))
-    
-    # Save to CSV
-    summary_df.to_csv(output_file, index=False)
-    print(f"\nSummary saved to: {output_file}")
+def plot_cm(cm, variable, out):
+    cmn = np.nan_to_num(cm / cm.sum(axis=1, keepdims=True))  # row-normalize; guard all-zero rows
+    sns.heatmap(cmn, annot=True, fmt=".2%", xticklabels=CLASSES, yticklabels=CLASSES)
+    plt.title(variable)
+    plt.xlabel("Predicted")
+    plt.ylabel("Actual")
+    plt.tight_layout()
+    plt.savefig(out, dpi=300)
+    plt.close()
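For reference, analyze_variable() expects the merged frame to carry paired "<variable>_actual" / "<variable>_predicted" columns. A toy run, assuming the script's functions are importable in a REPL; the values are invented:

    import pandas as pd
    from confusion_matrix_analysis import analyze_variable

    toy = pd.DataFrame({
        "PMID": [1, 2, 3],
        "Body Site Status_predicted": ["PRESENT", "ABSENT", "PRESENT"],
        "Body Site Status_actual": ["PRESENT", "ABSENT", "PARTIALLY_PRESENT"],
    })
    r = analyze_variable(toy, "Body Site Status")
    print(r["n"], r["accuracy"], r["mcc_multiclass"])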
 
 def main():
-    """Main analysis function."""
-    # File paths
-    analysis_file = Path("analysis_results.csv")
-    feedback_file = Path("feedback.csv")
-    output_dir = Path("confusion_matrix_results")
-    output_dir.mkdir(exist_ok=True)
-    
-    # Variables to analyze
+    preds, fb = load_data(
+        Path("analysis_results.csv"),
+        Path("feedback.csv"),
+    )
+    merged = merge_on_pmid(preds, fb)
+
+    outdir = Path("confusion_matrix_results")
+    outdir.mkdir(exist_ok=True)
+
     variables = [
-        'Host Species Status',
-        'Body Site Status',
-        'Condition Status',
-        'Sequencing Type Status',
-        'Taxa Level Status',
-        'Sample Size Status'
+        "Host Species Status",
+        "Body Site Status",
+        "Condition Status",
+        "Sequencing Type Status",
+        "Taxa Level Status",
+        "Sample Size Status",
     ]
-    
-    # Load and merge data
-    analysis_df, feedback_df = load_data(analysis_file, feedback_file)
-    merged_df = merge_data(analysis_df, feedback_df)
-    
-    # Analyze each variable
-    all_results = []
-    
-    for variable in variables:
-        result = analyze_variable(merged_df, variable)
-        if result:
-            all_results.append(result)
-            
-            # Plot confusion matrix
-            cm = np.array(result['confusion_matrix'])
-            plot_path = output_dir / f"confusion_matrix_{variable.replace(' ', '_')}.png"
-            plot_confusion_matrix(cm, result['classes'], variable, str(plot_path))
-    
-    # Generate summary report
-    summary_file = output_dir / "summary_metrics.csv"
-    generate_summary_report(all_results, str(summary_file))
-    
-    # Save detailed results as JSON
-    json_file = output_dir / "detailed_results.json"
-    with open(json_file, 'w') as f:
-        json.dump(all_results, f, indent=2)
-    print(f"\nDetailed results saved to: {json_file}")
-    
-    # Create a comprehensive comparison table
-    comparison_file = output_dir / "comparison_table.csv"
-    comparison_data = []
-    for result in all_results:
-        if result is None:
-            continue
-        for i, actual_class in enumerate(result['classes']):
-            for j, pred_class in enumerate(result['classes']):
-                count = result['confusion_matrix'][i][j]
-                comparison_data.append({
-                    'Variable': result['variable'],
-                    'Actual': actual_class,
-                    'Predicted': pred_class,
-                    'Count': count
-                })
-    
-    comparison_df = pd.DataFrame(comparison_data)
-    comparison_df.to_csv(comparison_file, index=False)
-    print(f"Comparison table saved to: {comparison_file}")
-    
-    print("\n" + "="*80)
-    print("Analysis complete!")
-    print(f"Results saved in: {output_dir}")
-    print("="*80)
+
+    results = []
+    for v in variables:
+        r = analyze_variable(merged, v)
+        results.append(r)
+        plot_cm(np.array(r["confusion_matrix"]), v, outdir / f"{v.replace(' ', '_')}.png")
+
+    # Summary CSV keeps scalar metrics; full matrices go to the JSON file
+    pd.DataFrame(results).drop(columns=["confusion_matrix"]).to_csv(
+        outdir / "summary_metrics.csv", index=False
+    )
+    with open(outdir / "detailed_results.json", "w") as fh:
+        json.dump(results, fh, indent=2)
+
+    print("Validation complete. Results in:", outdir)
 
 
 if __name__ == "__main__":
     main()
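A minimal smoke test for the whole pipeline; the rows below are fabricated purely to exercise the plumbing (metrics computed on two samples are meaningless), and the file names match the script's defaults:

    import pandas as pd

    cols = ["Host Species Status", "Body Site Status", "Condition Status",
            "Sequencing Type Status", "Taxa Level Status", "Sample Size Status"]

    pd.DataFrame({"PMID": [101, 102], **{c: ["PRESENT", "ABSENT"] for c in cols}}) \
        .to_csv("analysis_results.csv", index=False)
    pd.DataFrame({"PMID": [101, 102], **{c: ["PRESENT", "PARTIALLY_PRESENT"] for c in cols}}) \
        .to_csv("feedback.csv", index=False)
    # then: python confusion_matrix_analysis.py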