Skip to content

Commit a6d6075

Browse files
committed
feat: enrich github provenance evidence and review metrics
1 parent 07f445b commit a6d6075

File tree

6 files changed

+79
-28
lines changed

6 files changed

+79
-28
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ Copy `.env.example` to `.env` and adjust values locally if you prefer dotenv-sty
8383
| `PROVENANCE_GITHUB_BASE_URL` | GitHub enterprise base URL (optional) | *(unset)* |
8484
| `PROVENANCE_GITHUB_AGENT_LABEL_PREFIX` | PR label prefix used to infer agent IDs | `agent:` |
8585
| `PROVENANCE_GITHUB_CACHE_TTL_SECONDS` | Cache TTL (seconds) for GitHub metadata lookups | `300` |
86+
| `PROVENANCE_GITHUB_AGENT_MAP` | JSON map of GitHub logins, PR labels, or PR-body keywords (matched case-insensitively) to agent IDs | `{}` |
8687

8788
## Detection with Semgrep
8889

@@ -147,6 +148,7 @@ Example ingestion payload:
147148

148149
- `/v1/analytics/summary` now supports additional metrics: `code_volume`, `code_churn_rate`, and `avg_line_complexity` in addition to `risk_rate` and `provenance_coverage`.
149150
- `/v1/analytics/agents/behavior` returns composite snapshots (volume, churn rate, heuristic complexity, and top vulnerability categories per agent) to power comparison dashboards.
151+
- Review-focused metrics (`review_comments`, `unique_reviewers`) are computed from GitHub PR data when GitHub credentials are supplied.
150152
- Use `PROVENANCE_ANALYTICS_DEFAULT_WINDOW` or query parameters such as `?time_window=14d` to track longer horizons and compare agents.
151153

152154
## Telemetry Export

app/core/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class Settings(BaseSettings):
4040
github_base_url: str | None = None
4141
github_agent_label_prefix: str = "agent:"
4242
github_cache_ttl_seconds: int = 300
43+
github_agent_map: dict[str, str] = Field(default_factory=dict)
4344

4445
model_config = SettingsConfigDict(env_prefix="provenance_", env_file=".env", extra="ignore")
4546

app/dependencies.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def get_github_resolver() -> GitHubProvenanceResolver | None:
5555
base_url=settings.github_base_url,
5656
agent_label_prefix=settings.github_agent_label_prefix,
5757
cache_ttl_seconds=settings.github_cache_ttl_seconds,
58+
agent_map=settings.github_agent_map,
5859
)
5960

6061

app/provenance/github_resolver.py

Lines changed: 62 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def __init__(
2323
base_url: str | None = None,
2424
agent_label_prefix: str = "agent:",
2525
cache_ttl_seconds: int = 300,
26+
agent_map: dict[str, str] | None = None,
2627
) -> None:
2728
self._agent_label_prefix = agent_label_prefix.lower()
2829
auth = Token(token)
@@ -31,6 +32,7 @@ def __init__(
3132
else:
3233
self._client = Github(auth=auth)
3334
self._cache_ttl = max(cache_ttl_seconds, 30)
35+
self._agent_map = {k.lower(): v for k, v in (agent_map or {}).items()}
3436
self._commit_cache: dict[tuple[str, str], tuple[float, Optional[Commit.Commit]]] = {}
3537
self._label_cache: dict[tuple[str, int], tuple[float, list[str]]] = {}
3638
self._comment_cache: dict[tuple[str, int], tuple[float, list[str]]] = {}
@@ -42,17 +44,31 @@ def resolve_agent(
4244
repo_full_name: str,
4345
pr_number: str | None,
4446
commit_sha: str | None,
45-
) -> tuple[Optional[str], Optional[str]]:
47+
) -> tuple[Optional[str], Optional[str], dict]:
4648
agent_id: Optional[str] = None
4749
session_id: Optional[str] = None
50+
evidence: dict = {}
4851

4952
if commit_sha:
50-
agent_id, session_id = self._from_commit(repo_full_name, commit_sha)
53+
agent_id, session_id, commit_evidence = self._from_commit(repo_full_name, commit_sha)
54+
evidence.setdefault("sources", []).append(commit_evidence)
5155
if not agent_id and pr_number:
52-
agent_id = self._from_pr_labels(repo_full_name, int(pr_number))
56+
label_agent, label_evidence = self._from_pr_labels(repo_full_name, int(pr_number))
57+
if label_agent:
58+
agent_id = label_agent
59+
evidence.setdefault("sources", []).append(label_evidence)
5360
if not agent_id and pr_number:
54-
agent_id = self._from_pr_discussion(repo_full_name, int(pr_number))
55-
return agent_id, session_id
61+
discussion_agent, discussion_evidence = self._from_pr_discussion(repo_full_name, int(pr_number))
62+
if discussion_agent:
63+
agent_id = discussion_agent
64+
evidence.setdefault("sources", []).append(discussion_evidence)
65+
if not agent_id and pr_number:
66+
body_agent, body_evidence = self._from_pr_body(repo_full_name, int(pr_number))
67+
if body_agent:
68+
agent_id = body_agent
69+
evidence.setdefault("sources", []).append(body_evidence)
70+
evidence["agent_id"] = agent_id
71+
return agent_id, session_id, evidence
5672

5773
def review_stats(self, repo_full_name: str, pr_number: int) -> dict[str, int] | None:
5874
comments = self._fetch_pr_comments(repo_full_name, pr_number)
@@ -82,23 +98,24 @@ def _fetch_commit(self, repo_full_name: str, sha: str) -> Optional[Commit.Commit
8298
self._commit_cache[key] = (now + self._cache_ttl, commit)
8399
return commit
84100

85-
def _from_commit(self, repo_full_name: str, sha: str) -> tuple[Optional[str], Optional[str]]:
101+
def _from_commit(self, repo_full_name: str, sha: str) -> tuple[Optional[str], Optional[str], dict]:
86102
commit = self._fetch_commit(repo_full_name, sha)
87103
if not commit:
88-
return None, None
104+
return None, None, {"source": "commit", "reason": "not_found"}
89105
message = commit.commit.message or ""
90106
for line in message.splitlines():
91107
match = AGENT_TRAILER_PATTERN.match(line.strip())
92108
if match:
93-
return match.group("agent"), None
109+
return match.group("agent"), None, {"source": "commit_trailer", "line": line.strip()}
94110
for line in message.splitlines():
95111
match = CO_AUTHOR_PATTERN.match(line.strip())
96112
if match and "copilot" in match.group("author").lower():
97-
return "github-copilot", None
113+
return "github-copilot", None, {"source": "co_author", "value": match.group("author")}
98114
author_login = getattr(commit.author, "login", "") or ""
99115
if author_login:
100-
return author_login, None
101-
return None, None
116+
mapped = self._agent_map.get(author_login.lower())
117+
return mapped or author_login, None, {"source": "commit_author", "value": author_login}
118+
return None, None, {"source": "commit", "reason": "no_author"}
102119

103120
def _fetch_pr_labels(self, repo_full_name: str, pr_number: int) -> list[str]:
104121
key = (repo_full_name, pr_number)
@@ -115,12 +132,16 @@ def _fetch_pr_labels(self, repo_full_name: str, pr_number: int) -> list[str]:
115132
self._label_cache[key] = (now + self._cache_ttl, labels)
116133
return labels
117134

118-
def _from_pr_labels(self, repo_full_name: str, pr_number: int) -> Optional[str]:
119-
for label in self._fetch_pr_labels(repo_full_name, pr_number):
135+
def _from_pr_labels(self, repo_full_name: str, pr_number: int) -> tuple[Optional[str], dict]:
136+
labels = self._fetch_pr_labels(repo_full_name, pr_number)
137+
for label in labels:
120138
lower = label.lower()
121139
if lower.startswith(self._agent_label_prefix):
122-
return label.split(":", 1)[-1].strip()
123-
return None
140+
return label.split(":", 1)[-1].strip(), {"source": "label", "label": label}
141+
mapped = self._agent_map.get(lower)
142+
if mapped:
143+
return mapped, {"source": "label_map", "label": label}
144+
return None, {"source": "label", "labels": labels}
124145

125146
def _fetch_pr_comments(self, repo_full_name: str, pr_number: int) -> list[str]:
126147
key = (repo_full_name, pr_number)
@@ -168,18 +189,38 @@ def _fetch_review_events(self, repo_full_name: str, pr_number: int) -> int:
168189
self._review_event_cache[key] = (now + self._cache_ttl, events)
169190
return events
170191

171-
def _from_pr_discussion(self, repo_full_name: str, pr_number: int) -> Optional[str]:
192+
def _from_pr_discussion(self, repo_full_name: str, pr_number: int) -> tuple[Optional[str], dict]:
172193
for body in self._fetch_pr_comments(repo_full_name, pr_number):
173194
for line in body.splitlines():
174195
match = AGENT_TRAILER_PATTERN.match(line.strip())
175196
if match:
176-
return match.group("agent")
197+
return match.group("agent"), {"source": "comment", "line": line.strip()}
177198
for author in self._fetch_review_authors(repo_full_name, pr_number):
178199
lower = author.lower()
179200
if "copilot" in lower:
180-
return "github-copilot"
201+
return "github-copilot", {"source": "reviewer", "value": author}
202+
mapped = self._agent_map.get(lower)
203+
if mapped:
204+
return mapped, {"source": "reviewer_map", "value": author}
181205
if any(key in lower for key in ("claude", "gemini", "gpt", "bard")):
182-
return lower
206+
return lower, {"source": "reviewer_heuristic", "value": author}
183207
if lower.endswith("-bot"):
184-
return lower
185-
return None
208+
return lower, {"source": "reviewer_bot", "value": author}
209+
return None, {"source": "discussion", "reason": "no_match"}
210+
211+
def _from_pr_body(self, repo_full_name: str, pr_number: int) -> tuple[Optional[str], dict]:
212+
try:
213+
repo = self._client.get_repo(repo_full_name)
214+
pr = repo.get_pull(pr_number)
215+
body = pr.body or ""
216+
except GithubException:
217+
return None, {"source": "body", "reason": "error"}
218+
for line in body.splitlines():
219+
match = AGENT_TRAILER_PATTERN.match(line.strip())
220+
if match:
221+
return match.group("agent"), {"source": "body", "line": line.strip()}
222+
lower_body = body.lower()
223+
for key, mapped in self._agent_map.items():
224+
if key in lower_body:
225+
return mapped, {"source": "body_map", "value": key}
226+
return None, {"source": "body", "reason": "no_match"}

app/services/analysis.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def _map_changed_line(
141141
provenance_marker=payload.attribution.provenance_marker,
142142
)
143143
if not attribution.agent.agent_id:
144-
agent_id, session_id = self._resolve_agent(
144+
agent_id, session_id, evidence = self._resolve_agent(
145145
repo=request.repo,
146146
pr_number=request.pr_number,
147147
commit_sha=attribution.commit_sha,
@@ -150,6 +150,8 @@ def _map_changed_line(
150150
attribution.agent.agent_id = agent_id
151151
if session_id:
152152
attribution.agent_session_id = session_id
153+
if evidence:
154+
attribution.provenance_marker = str(evidence)
153155
return ChangedLine(
154156
analysis_id=analysis_id,
155157
repo_id=request.repo,
@@ -172,9 +174,9 @@ def _resolve_agent(
172174
repo: str,
173175
pr_number: str,
174176
commit_sha: str | None,
175-
) -> tuple[str | None, str | None]:
177+
) -> tuple[str | None, str | None, dict]:
176178
if not self._github_resolver:
177-
return None, None
179+
return None, None, {}
178180
return self._github_resolver.resolve_agent(repo, pr_number, commit_sha)
179181

180182
def list_findings(self, analysis_id: str) -> list[Finding]:

tests/provenance/test_github_resolver.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,28 @@ def test_resolver_extracts_agent_from_commit_trailer(monkeypatch):
1818
"_fetch_commit",
1919
lambda self, repo, sha: StubCommit("Fix bug\nAgent-ID: claude-3-opus"),
2020
)
21-
agent, session = resolver.resolve_agent("acme/repo", "42", "abc123")
21+
agent, session, evidence = resolver.resolve_agent("acme/repo", "42", "abc123")
2222
assert agent == "claude-3-opus"
2323
assert session is None
24+
assert evidence["agent_id"] == "claude-3-opus"
2425

2526

2627
def test_resolver_uses_coauthor(monkeypatch):
2728
resolver = GitHubProvenanceResolver(token="token")
2829
message = "Refactor\nCo-authored-by: GitHub Copilot <copilot@example.com>"
2930
monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_commit", lambda self, repo, sha: StubCommit(message))
30-
agent, _ = resolver.resolve_agent("acme/repo", None, "def456")
31+
agent, _, evidence = resolver.resolve_agent("acme/repo", None, "def456")
3132
assert agent == "github-copilot"
33+
assert evidence["agent_id"] == "github-copilot"
3234

3335

3436
def test_resolver_falls_back_to_pr_labels(monkeypatch):
3537
resolver = GitHubProvenanceResolver(token="token", agent_label_prefix="agent:")
3638
monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_commit", lambda self, repo, sha: None)
3739
monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_pr_labels", lambda self, repo, pr: ["Agent: gemini-pro"])
38-
agent, _ = resolver.resolve_agent("acme/repo", "77", None)
40+
agent, _, evidence = resolver.resolve_agent("acme/repo", "77", None)
3941
assert agent == "gemini-pro"
42+
assert evidence["agent_id"] == "gemini-pro"
4043

4144

4245
def test_resolver_uses_pr_comments(monkeypatch):
@@ -48,8 +51,9 @@ def test_resolver_uses_pr_comments(monkeypatch):
4851
"_fetch_pr_comments",
4952
lambda self, repo, pr: ["LGTM\nAgent-ID: gemma-7b"],
5053
)
51-
agent, _ = resolver.resolve_agent("acme/repo", "77", None)
54+
agent, _, evidence = resolver.resolve_agent("acme/repo", "77", None)
5255
assert agent == "gemma-7b"
56+
assert evidence["agent_id"] == "gemma-7b"
5357

5458

5559
def test_review_stats(monkeypatch):

0 commit comments

Comments
 (0)