CarterPerez-dev
diff --git a/‎PROJECTS/advanced/ai-threat-detection/backend/app/api/health.py‎
Lines changed: 6 additions & 2 deletions b/‎PROJECTS/advanced/ai-threat-detection/backend/app/api/health.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎PROJECTS/advanced/ai-threat-detection/backend/app/api/models_api.py‎
Lines changed: 141 additions & 101 deletions b/‎PROJECTS/advanced/ai-threat-detection/backend/app/api/models_api.py‎
Lines changed: 141 additions & 101 deletions
diff --git a/‎PROJECTS/advanced/ai-threat-detection/backend/app/config.py‎
Lines changed: 22 additions & 2 deletions b/‎PROJECTS/advanced/ai-threat-detection/backend/app/config.py‎
Lines changed: 22 additions & 2 deletions
@@ -5,8 +5,10 @@
 Health and readiness probe endpoints for container
 orchestration
 
-GET /health returns liveness status with uptime_seconds
-and pipeline_running flag. GET /ready checks database
+GET /health returns liveness status with uptime_seconds,
+pipeline_running flag, and per-stage pipeline_stats
+counters (parsed/enriched/scored/dispatched with error
+counts). GET /ready checks database
 connectivity (SELECT 1) and Redis ping, reports
 models_loaded status, and returns 503 if any dependency
 is down. Both endpoints read from app.state set during
@@ -34,10 +36,12 @@ async def health(request: Request) -> dict[str, object]:
     Liveness probe — returns 200 if the process is alive.
     """
     uptime = time.monotonic() - request.app.state.startup_time
+    pipeline = getattr(request.app.state, "pipeline", None)
     return {
         "status": "healthy",
         "uptime_seconds": round(uptime, 2),
         "pipeline_running": request.app.state.pipeline_running,
+        "pipeline_stats": pipeline.stats if pipeline else {},
     }
 
 
 
@@ -6,14 +6,16 @@
 
 GET /models/status returns models_loaded flag, detection
 _mode (hybrid or rules), and active model metadata from
-the database. POST /models/retrain dispatches a
+the database. POST /models/retrain acquires _retrain_lock
+(returning 409 if already running), dispatches a
 background retraining job that loads stored ThreatEvents,
 labels them using review_label or score thresholds
 (SCORE_ATTACK_THRESHOLD 0.5, SCORE_NORMAL_CEILING 0.3),
 supplements with synthetic data if below MIN_TRAINING_
 SAMPLES (200), runs TrainingOrchestrator, and writes
 model metadata. _fallback_synthetic spawns a subprocess
-CLI train command when no real events exist
+CLI train command with lifecycle tracking via
+_synthetic_process
 
 Connects to:
   config.py              - settings.model_dir, ensemble
@@ -25,10 +27,13 @@
   cli/main               - _write_metadata
 """
 
+import asyncio
 import logging
+import subprocess
 import uuid
 
-from fastapi import APIRouter, BackgroundTasks, Request
+from fastapi import APIRouter, BackgroundTasks, Request, Response
+from fastapi.responses import JSONResponse
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 
@@ -46,6 +51,9 @@
 SYNTHETIC_SUPPLEMENT_NORMAL = 500
 SYNTHETIC_SUPPLEMENT_ATTACK = 250
 
+_retrain_lock = asyncio.Lock()
+_synthetic_process: subprocess.Popen[bytes] | None = None
+
 
 @router.get("/status")
 async def model_status(request: Request) -> dict[str, object]:
@@ -68,15 +76,21 @@ async def model_status(request: Request) -> dict[str, object]:
     }
 
 
-@router.post("/retrain", status_code=202)
+@router.post("/retrain", status_code=202, response_model=None)
 async def retrain(
     request: Request,
     background_tasks: BackgroundTasks,
-) -> dict[str, object]:
+) -> dict[str, object] | Response:
     """
     Dispatch a model retraining job using real stored
     threat events supplemented with synthetic data
     """
+    if _retrain_lock.locked():
+        return JSONResponse(
+            status_code=409,
+            content={"status": "conflict", "job_id": ""},
+        )
+
     session_factory = getattr(request.app.state, "session_factory", None)
     if session_factory is None:
         return {"status": "error", "job_id": ""}
@@ -99,129 +113,155 @@ async def _retrain_from_db(
     supplement with synthetic data if needed, and run
     the full training pipeline
     """
-    import asyncio
     import dataclasses
     from pathlib import Path
 
     import numpy as np
 
     from ml.orchestrator import TrainingOrchestrator
 
-    logger.info("Retrain job %s: loading stored events", job_id)
-
-    async with session_factory() as session:
-        count = (await session.execute(
-            select(func.count()).select_from(ThreatEvent)
-        )).scalar_one()
-
-        if count == 0:
-            logger.warning(
-                "Retrain job %s: no stored events, using synthetic only",
-                job_id,
-            )
-            _fallback_synthetic(job_id)
-            return
+    async with _retrain_lock:
+        logger.info("Retrain job %s: loading stored events", job_id)
 
-        rows = (await session.execute(
-            select(ThreatEvent)
-        )).scalars().all()
+        async with session_factory() as session:
+            count = (await session.execute(
+                select(func.count()).select_from(ThreatEvent)
+            )).scalar_one()
+
+            if count == 0:
+                logger.warning(
+                    "Retrain job %s: no stored events, "
+                    "using synthetic only",
+                    job_id,
+                )
+                _fallback_synthetic(job_id)
+                return
+
+            rows = (await session.execute(
+                select(ThreatEvent)
+            )).scalars().all()
+
+        vectors: list[list[float]] = []
+        labels: list[int] = []
+
+        for event in rows:
+            if not event.feature_vector:
+                continue
+
+            if event.reviewed and event.review_label:
+                label = (
+                    1 if event.review_label == "true_positive"
+                    else 0
+                )
+            elif event.threat_score >= SCORE_ATTACK_THRESHOLD:
+                label = 1
+            elif event.threat_score < SCORE_NORMAL_CEILING:
+                label = 0
+            else:
+                continue
+
+            vectors.append(event.feature_vector)
+            labels.append(label)
 
-    vectors: list[list[float]] = []
-    labels: list[int] = []
+        logger.info(
+            "Retrain job %s: %d usable events from DB "
+            "(normal=%d, attack=%d)",
+            job_id,
+            len(vectors),
+            labels.count(0),
+            labels.count(1),
+        )
 
-    for event in rows:
-        if not event.feature_vector:
-            continue
+        from ml.synthetic import generate_mixed_dataset
 
-        if event.reviewed and event.review_label:
-            label = 1 if event.review_label == "true_positive" else 0
-        elif event.threat_score >= SCORE_ATTACK_THRESHOLD:
-            label = 1
-        elif event.threat_score < SCORE_NORMAL_CEILING:
-            label = 0
+        if len(vectors) < MIN_TRAINING_SAMPLES:
+            syn_X, syn_y = generate_mixed_dataset(
+                SYNTHETIC_SUPPLEMENT_NORMAL,
+                SYNTHETIC_SUPPLEMENT_ATTACK,
+            )
+            X = np.concatenate([
+                np.array(vectors, dtype=np.float32),
+                syn_X,
+            ]) if vectors else syn_X
+            y = np.concatenate([
+                np.array(labels, dtype=np.int32),
+                syn_y,
+            ]) if labels else syn_y
+            logger.info(
+                "Retrain job %s: supplemented with "
+                "%d synthetic samples",
+                job_id,
+                len(syn_X),
+            )
         else:
-            continue
-
-        vectors.append(event.feature_vector)
-        labels.append(label)
-
-    logger.info(
-        "Retrain job %s: %d usable events from DB "
-        "(normal=%d, attack=%d)",
-        job_id,
-        len(vectors),
-        labels.count(0),
-        labels.count(1),
-    )
-
-    from ml.synthetic import generate_mixed_dataset
-
-    if len(vectors) < MIN_TRAINING_SAMPLES:
-        syn_X, syn_y = generate_mixed_dataset(
-            SYNTHETIC_SUPPLEMENT_NORMAL,
-            SYNTHETIC_SUPPLEMENT_ATTACK,
+            X = np.array(vectors, dtype=np.float32)
+            y = np.array(labels, dtype=np.int32)
+
+        output_dir = Path(settings.model_dir)
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(
+            None,
+            lambda: TrainingOrchestrator(
+                output_dir=output_dir,
+            ).run(X, y),
         )
-        X = np.concatenate([
-            np.array(vectors, dtype=np.float32),
-            syn_X,
-        ]) if vectors else syn_X
-        y = np.concatenate([
-            np.array(labels, dtype=np.int32),
-            syn_y,
-        ]) if labels else syn_y
+
         logger.info(
-            "Retrain job %s: supplemented with %d synthetic samples",
+            "Retrain job %s complete: passed_gates=%s",
             job_id,
-            len(syn_X),
+            result.passed_gates,
         )
-    else:
-        X = np.array(vectors, dtype=np.float32)
-        y = np.array(labels, dtype=np.int32)
-
-    output_dir = Path(settings.model_dir)
-    loop = asyncio.get_running_loop()
-    result = await loop.run_in_executor(
-        None,
-        lambda: TrainingOrchestrator(output_dir=output_dir).run(X, y),
-    )
-
-    logger.info(
-        "Retrain job %s complete: passed_gates=%s",
-        job_id,
-        result.passed_gates,
-    )
 
-    try:
-        from cli.main import _write_metadata
+        try:
+            from cli.main import _write_metadata
 
-        metrics: dict[str, object] = (
-            dataclasses.asdict(result.ensemble_metrics)
-            if result.ensemble_metrics else {}
-        )
-        await _write_metadata(
-            output_dir,
-            len(X),
-            metrics,
-            result.mlflow_run_id,
-            result.ae_metrics.get("ae_threshold"),
-        )
-    except Exception:
-        logger.exception(
-            "Retrain job %s: failed to write metadata",
-            job_id,
-        )
+            metrics: dict[str, object] = (
+                dataclasses.asdict(result.ensemble_metrics)
+                if result.ensemble_metrics else {}
+            )
+            await _write_metadata(
+                output_dir,
+                len(X),
+                metrics,
+                result.mlflow_run_id,
+                result.ae_metrics.get("ae_threshold"),
+            )
+        except Exception:
+            logger.exception(
+                "Retrain job %s: failed to write metadata",
+                job_id,
+            )
 
 
 def _fallback_synthetic(job_id: str) -> None:
     """
     Run training with synthetic data only when no real
     events exist
     """
-    import subprocess
+    global _synthetic_process  # noqa: PLW0603
     import sys
 
-    logger.info("Retrain job %s: falling back to synthetic training", job_id)
-    subprocess.Popen(
+    if _synthetic_process is not None:
+        if _synthetic_process.poll() is None:
+            logger.info(
+                "Retrain job %s: synthetic training already "
+                "running (pid=%d)",
+                job_id,
+                _synthetic_process.pid,
+            )
+            return
+        rc = _synthetic_process.returncode
+        if rc != 0:
+            logger.warning(
+                "Previous synthetic training exited with %d",
+                rc,
+            )
+
+    logger.info(
+        "Retrain job %s: falling back to synthetic training",
+        job_id,
+    )
+    _synthetic_process = subprocess.Popen(
         [
             sys.executable,
             "-m",
 
@@ -13,8 +13,9 @@
 settings (size 32, timeout 50ms), and ML configuration
 (model_dir, detection_mode, ensemble weights for
 autoencoder/random-forest/isolation-forest at 0.40/0.40
-/0.20, ae_threshold_percentile 99.5, MLflow tracking
-URI). Exports a module-level singleton settings instance
+/0.20 with model_validator enforcing sum-to-1.0,
+ae_threshold_percentile 99.5, MLflow tracking URI).
+Exports a module-level singleton settings instance
 
 Connects to:
   factory.py        - consumed in lifespan and create_app
@@ -24,6 +25,9 @@
   core/enrichment/  - geoip_db_path
 """
 
+from typing import Self
+
+from pydantic import model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
@@ -70,5 +74,21 @@ class Settings(BaseSettings):
     ae_threshold_percentile: float = 99.5
     mlflow_tracking_uri: str = "file:./mlruns"
 
+    @model_validator(mode="after")
+    def _check_ensemble_weights(self) -> Self:
+        """
+        Validate that the autoencoder, random-forest and
+        isolation-forest ensemble weights sum to 1.0
+        (tolerance 1e-6)
+
+        Returns self unchanged when the weights are valid.
+        Raises ValueError when the sum is off or is NaN
+        """
+        total = (
+            self.ensemble_weight_ae
+            + self.ensemble_weight_rf
+            + self.ensemble_weight_if
+        )
+        # Written as "not <=" rather than ">" on purpose: every
+        # comparison with NaN is False, so a NaN weight would
+        # slip through a plain "> 1e-6" check
+        if not abs(total - 1.0) <= 1e-6:
+            raise ValueError(
+                f"Ensemble weights must sum to 1.0, got {total:.6f}"
+            )
+        return self
+
 
 settings = Settings()