evalops
diff --git a/README.md b/README.md
Lines changed: 5 additions & 4 deletions
diff --git a/app/models/analytics.py b/app/models/analytics.py
Lines changed: 28 additions & 0 deletions
@@ -98,7 +98,8 @@ Copy `.env.example` to `.env` and adjust values locally if you prefer dotenv-sty
 - The JSON results are mapped back to the originating changed lines so findings retain repo/PR/file/line attribution.
 - Extend the rule pack or point the detector at your organization-wide Semgrep registry by updating `SemgrepDetector` in `app/services/detection.py`.
 - Register additional detectors by providing module paths in `PROVENANCE_DETECTOR_MODULE_PATHS`; each module should expose `register_detectors()` returning `BaseDetector` instances.
-- When GitHub credentials are configured, the service automatically inspects commit trailers, PR labels, review comments, and reviewer identities to fill missing agent attribution (see `app/provenance/github_resolver.py`).
+- When GitHub credentials are configured, the service automatically inspects commit trailers, PR labels, review comments, reviewer identities, and PR timelines to fill missing agent attribution and capture structured evidence (see `app/provenance/github_resolver.py`).
+- The resolver also persists PR conversations (thread counts, classification breakdowns, agent response latency), CI outcomes (time-to-green, failed checks), and commit/timeline summaries (force pushes, human follow-ups, rewrite loops) so analytics can surface behavioral signals without re-calling the GitHub API.
 - Built-in heuristics now include a Python import detector that flags risky modules (e.g., `subprocess`, `pickle`); extend this pattern with your own detectors via modular hooks.
 
 ## API Surface
@@ -146,9 +147,9 @@ Example ingestion payload:
 
 ## Agent Insights & Analytics
 
-- `/v1/analytics/summary` now supports additional metrics: `code_volume`, `code_churn_rate`, and `avg_line_complexity` in addition to `risk_rate` and `provenance_coverage`.
-- `/v1/analytics/agents/behavior` returns composite snapshots (volume, churn rate, heuristic complexity, and top vulnerability categories per agent) to power comparison dashboards.
-- Review-focused metrics (`review_comments`, `unique_reviewers`) leverage GitHub PR data when credentials are supplied.
+- `/v1/analytics/summary` now surfaces GitHub-aware metrics alongside the existing risk/volume suite: `code_volume`, `code_churn_rate`, `avg_line_complexity`, `agent_response_rate`, `agent_response_p50_hours`, `agent_response_p90_hours`, `reopened_threads`, `force_push_events`, `rewrite_loops`, `human_followup_commits`, `ci_time_to_green_hours`, `ci_failed_checks`, `agent_commit_ratio`, `commit_lead_time_hours`, and `classification_<label>_count` (e.g., `classification_security_count`).
+- `/v1/analytics/agents/behavior` returns composite snapshots that now blend code/finding metrics with review conversation health (thread counts, response latency, classification breakdowns), CI friction (failures, time-to-green), commit dynamics (force pushes, rewrite loops, human follow-ups), and attention heatmaps (top paths + hot files) per agent.
+- Review-focused metrics (`review_comments`, `unique_reviewers`, `review_events`, `agent_comment_mentions`) continue to leverage GitHub PR data when credentials are supplied; classification metrics reflect the resolver's heuristic labeling of each conversation snippet.
 - Use `PROVENANCE_ANALYTICS_DEFAULT_WINDOW` or query parameters such as `?time_window=14d` to track longer horizons and compare agents.
 
 ## Telemetry Export
 
@@ -54,6 +54,34 @@ class AgentBehaviorSnapshot(BaseModel):
     unique_reviewers: int = Field(0, description="Unique reviewer count across associated PRs.")
     review_events: int = Field(0, description="Total review submissions across associated PRs.")
     agent_comment_mentions: int = Field(0, description="Count of agent markers found in review comments.")
+    comment_threads: int = Field(0, description="Distinct review discussion threads observed.")
+    reopened_threads: int = Field(0, description="Threads where reviewers re-engaged after an agent response.")
+    agent_response_rate: float = Field(0.0, description="Share of threads with an agent response.")
+    agent_response_p50_hours: Optional[float] = Field(
+        None, description="Median response time (hours) between reviewer comment and agent reply."
+    )
+    agent_response_p90_hours: Optional[float] = Field(
+        None, description="P90 response time (hours) between reviewer comment and agent reply."
+    )
+    classification_breakdown: dict[str, int] = Field(
+        default_factory=dict, description="Aggregate comment/review classifications (security, nit, etc.)."
+    )
+    ci_run_count: int = Field(0, description="Number of CI runs/checks evaluated.")
+    ci_failure_count: int = Field(0, description="Number of failing CI runs/checks.")
+    ci_failed_checks: int = Field(0, description="Unique failing CI checks in the window.")
+    ci_time_to_green_hours: Optional[float] = Field(None, description="Median time-to-green across CI runs.")
+    ci_latest_status: Optional[str] = Field(None, description="Most recent CI rollup status observed.")
+    force_push_events: int = Field(0, description="Force-push events recorded on associated PRs.")
+    rewrite_loops: int = Field(0, description="Follow-up human commits arriving within 48h of agent commits.")
+    human_followup_commits: int = Field(0, description="Count of human commits landing immediately after agent commits.")
+    agent_commit_ratio: float = Field(0.0, description="Share of commits authored by the agent.")
+    commit_lead_time_hours: Optional[float] = Field(None, description="Lead time between earliest and latest commits.")
+    top_paths: dict[str, int] = Field(
+        default_factory=dict, description="Most frequently modified top-level paths for the agent."
+    )
+    hot_files: list[str] = Field(
+        default_factory=list, description="Files touched repeatedly (>=3 times) signalling attention hot-spots."
+    )
 
 
 class AgentBehaviorReport(BaseModel):