Skip to content

Commit f3ae37c

Browse files
committed
feat: enrich provenance with github heuristics
1 parent adff1ea commit f3ae37c

File tree

9 files changed

+223
-3
lines changed

9 files changed

+223
-3
lines changed

.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ PROVENANCE_OTEL_EXPORTER=console
2121
PROVENANCE_POLICY_WARN_THRESHOLDS={}
2222
PROVENANCE_POLICY_BLOCK_THRESHOLDS={}
2323
PROVENANCE_DETECTOR_MODULE_PATHS=[]
24+
PROVENANCE_GITHUB_TOKEN=
25+
PROVENANCE_GITHUB_BASE_URL=
26+
PROVENANCE_GITHUB_AGENT_LABEL_PREFIX=agent:
2427
PROVENANCE_SEMGREP_CONFIG_PATH=
2528
PROVENANCE_DASHBOARD_API=http://localhost:8000/v1
2629
PROVENANCE_DASHBOARD_EVENTS=data/timeseries_events.jsonl

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ Copy `.env.example` to `.env` and adjust values locally if you prefer dotenv-sty
7777
| `PROVENANCE_POLICY_WARN_THRESHOLDS` | JSON map of category warn thresholds | `{}` |
7878
| `PROVENANCE_POLICY_BLOCK_THRESHOLDS` | JSON map of category block thresholds | `{}` |
7979
| `PROVENANCE_DETECTOR_MODULE_PATHS` | JSON array of detector module paths to auto-load | `[]` |
80+
| `PROVENANCE_GITHUB_TOKEN` | Personal access token for GitHub API enrichment | *(unset)* |
81+
| `PROVENANCE_GITHUB_BASE_URL` | GitHub Enterprise API base URL, e.g. `https://github.example.com/api/v3` (optional; leave unset for github.com) | *(unset)* |
82+
| `PROVENANCE_GITHUB_AGENT_LABEL_PREFIX` | PR label prefix used to infer agent IDs | `agent:` |
8083

8184
## Detection with Semgrep
8285

@@ -91,6 +94,7 @@ Copy `.env.example` to `.env` and adjust values locally if you prefer dotenv-sty
9194
- The JSON results are mapped back to the originating changed lines so findings retain repo/PR/file/line attribution.
9295
- Extend the rule pack or point the detector at your organization-wide Semgrep registry by updating `SemgrepDetector` in `app/services/detection.py`.
9396
- Register additional detectors by providing module paths in `PROVENANCE_DETECTOR_MODULE_PATHS`; each module should expose `register_detectors()` returning `BaseDetector` instances.
97+
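- When GitHub credentials are configured, the service automatically fills missing agent attribution by inspecting, in order, `Agent-ID:` commit trailers, Copilot `Co-authored-by:` lines, the commit author's login, and PR labels that start with the configured agent prefix (see `app/provenance/github_resolver.py`).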
9498

9599
## API Surface
96100

app/core/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ class Settings(BaseSettings):
3434
policy_warn_thresholds: dict[str, int] = Field(default_factory=dict)
3535
policy_block_thresholds: dict[str, int] = Field(default_factory=dict)
3636
detector_module_paths: list[str] = Field(default_factory=list)
37+
github_token: str | None = None
38+
github_base_url: str | None = None
39+
github_agent_label_prefix: str = "agent:"
3740

3841
model_config = SettingsConfigDict(env_prefix="provenance_", env_file=".env", extra="ignore")
3942

app/dependencies.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from app.services.detection import DetectionService
1414
from app.services.governance import GovernanceService
1515
from app.telemetry import sink_from_settings, EventSink
16+
from app.provenance.github_resolver import GitHubProvenanceResolver
1617

1718

1819
@lru_cache
@@ -45,10 +46,22 @@ def get_governance_service() -> GovernanceService:
4546
return GovernanceService()
4647

4748

49+
@lru_cache
def get_github_resolver() -> GitHubProvenanceResolver | None:
    """Return the shared GitHub provenance resolver, if one is configured.

    A resolver is only built when ``settings.github_token`` is truthy;
    otherwise ``None`` is returned. The result is memoised by ``lru_cache``,
    so the settings are read once per process.
    """
    token = settings.github_token
    if token:
        return GitHubProvenanceResolver(
            token=token,
            base_url=settings.github_base_url,
            agent_label_prefix=settings.github_agent_label_prefix,
        )
    return None
58+
59+
4860
@lru_cache
4961
def get_analysis_service() -> AnalysisService:
5062
store = get_store()
5163
detection = get_detection_service()
5264
analytics = get_analytics_service()
5365
governance = get_governance_service()
54-
return AnalysisService(store, detection, governance, analytics)
66+
github_resolver = get_github_resolver()
67+
return AnalysisService(store, detection, governance, analytics, github_resolver)

app/provenance/github_resolver.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""GitHub-backed provenance resolution."""
2+
3+
from __future__ import annotations
4+
5+
import re
6+
from functools import lru_cache
7+
from typing import Optional
8+
9+
from github import Github, GithubException, Commit
10+
11+
# Matches an ``Agent-ID: <value>`` trailer at the start of a (stripped) line.
AGENT_TRAILER_PATTERN = re.compile(r"^Agent-ID:\s*(?P<agent>[^\s]+)", re.IGNORECASE)
# Matches a ``Co-authored-by: <name and email>`` trailer.
CO_AUTHOR_PATTERN = re.compile(r"Co-authored-by:\s*(?P<author>.+)", re.IGNORECASE)


class GitHubProvenanceResolver:
    """Resolve agent attribution using GitHub commit and PR metadata.

    Heuristics are applied in this order:

    1. an ``Agent-ID:`` trailer in the commit message,
    2. a ``Co-authored-by:`` trailer mentioning "copilot",
    3. the login of the commit author,
    4. a PR label starting with the configured agent prefix.

    Commit and label lookups are memoised per instance in small bounded
    caches, so resolving many changed lines of one commit/PR costs a single
    API round-trip each. Failed lookups are memoised as well.
    """

    # Upper bound on the number of entries kept in each per-instance cache.
    _CACHE_SIZE = 256

    def __init__(
        self,
        token: str,
        *,
        base_url: str | None = None,
        agent_label_prefix: str = "agent:",
    ) -> None:
        """Create a resolver backed by a PyGithub client.

        Args:
            token: GitHub access token used for API calls.
            base_url: Optional API base URL (GitHub Enterprise); a trailing
                slash is removed before it is handed to the client.
            agent_label_prefix: Case-insensitive PR label prefix that marks
                an agent label. An empty prefix disables label inference.
        """
        self._agent_label_prefix = agent_label_prefix.lower()
        # Per-instance caches: ``lru_cache`` on methods would key on ``self``,
        # keep every resolver alive and share one cache across all instances.
        self._commit_cache: dict[tuple[str, str], Optional[Commit.Commit]] = {}
        self._label_cache: dict[tuple[str, int], list[str]] = {}
        if base_url:
            self._client = Github(login_or_token=token, base_url=base_url.rstrip("/"))
        else:
            self._client = Github(login_or_token=token)

    def resolve_agent(
        self,
        repo_full_name: str,
        pr_number: str | None,
        commit_sha: str | None,
    ) -> tuple[Optional[str], Optional[str]]:
        """Infer ``(agent_id, session_id)`` for a change.

        Commit metadata is consulted first (when ``commit_sha`` is given);
        PR labels are only consulted when the commit yielded no agent and
        ``pr_number`` is a valid integer string. Either element of the
        returned tuple may be ``None``.
        """
        agent_id: Optional[str] = None
        session_id: Optional[str] = None

        if commit_sha:
            agent_id, session_id = self._from_commit(repo_full_name, commit_sha)
        if not agent_id and pr_number:
            try:
                number = int(pr_number)
            except (TypeError, ValueError):
                # A non-numeric PR identifier cannot be looked up; treat it
                # as "no PR" instead of aborting the whole resolution.
                number = None
            if number is not None:
                agent_id = self._from_pr_labels(repo_full_name, number)
        return agent_id, session_id

    @classmethod
    def _remember(cls, cache: dict, key: tuple, value: object) -> None:
        """Store ``value`` under ``key``, evicting the oldest entry when full."""
        if key not in cache and len(cache) >= cls._CACHE_SIZE:
            # dicts preserve insertion order, so the first key is the oldest.
            del cache[next(iter(cache))]
        cache[key] = value

    def _fetch_commit(self, repo_full_name: str, sha: str) -> Optional[Commit.Commit]:
        """Return the commit object for ``sha`` or ``None`` if GitHub rejects the lookup."""
        key = (repo_full_name, sha)
        if key in self._commit_cache:
            return self._commit_cache[key]
        try:
            commit = self._client.get_repo(repo_full_name).get_commit(sha)
        except GithubException:
            commit = None
        self._remember(self._commit_cache, key, commit)
        return commit

    def _from_commit(self, repo_full_name: str, sha: str) -> tuple[Optional[str], Optional[str]]:
        """Derive ``(agent_id, session_id)`` from a commit's message and author.

        The session id is always ``None``; no heuristic here produces one.
        """
        commit = self._fetch_commit(repo_full_name, sha)
        if not commit:
            return None, None
        lines = [line.strip() for line in (commit.commit.message or "").splitlines()]
        # An explicit Agent-ID trailer anywhere in the message wins over a
        # co-author hint, hence two separate passes.
        for line in lines:
            match = AGENT_TRAILER_PATTERN.match(line)
            if match:
                return match.group("agent"), None
        for line in lines:
            match = CO_AUTHOR_PATTERN.match(line)
            if match and "copilot" in match.group("author").lower():
                return "github-copilot", None
        # NOTE(review): falling back to the author login means PR labels are
        # only consulted for commits without a linked GitHub user — confirm
        # this precedence is intended.
        author_login = getattr(commit.author, "login", "") or ""
        if author_login:
            return author_login, None
        return None, None

    def _fetch_pr_labels(self, repo_full_name: str, pr_number: int) -> list[str]:
        """Return the label names of a pull request, or ``[]`` if the lookup fails."""
        key = (repo_full_name, pr_number)
        if key in self._label_cache:
            return self._label_cache[key]
        try:
            pull = self._client.get_repo(repo_full_name).get_pull(pr_number)
            labels = [label.name for label in pull.get_labels()]
        except GithubException:
            labels = []
        self._remember(self._label_cache, key, labels)
        return labels

    def _from_pr_labels(self, repo_full_name: str, pr_number: int) -> Optional[str]:
        """Return the agent id encoded in the first matching PR label, if any."""
        prefix = self._agent_label_prefix
        if not prefix:
            # An empty prefix would match every label on the PR.
            return None
        for label in self._fetch_pr_labels(repo_full_name, pr_number):
            if not label.lower().startswith(prefix):
                continue
            # Strip the configured prefix itself rather than splitting on ":",
            # so prefixes such as "bot/" or "ai:agent:" work as well.
            agent = label[len(prefix):].strip()
            if agent:
                return agent
        return None

app/services/analysis.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from datetime import datetime, timezone
66
import time
7+
from typing import TYPE_CHECKING
78

89
from fastapi import BackgroundTasks
910

@@ -23,6 +24,9 @@
2324
from app.services.governance import GovernanceService
2425
from app.telemetry import increment_analysis_ingestion, record_analysis_duration, record_analysis_findings
2526

27+
if TYPE_CHECKING:
28+
from app.provenance.github_resolver import GitHubProvenanceResolver
29+
2630

2731
def _now() -> datetime:
2832
return datetime.now(timezone.utc)
@@ -37,11 +41,13 @@ def __init__(
3741
detection_service: DetectionService,
3842
governance_service: GovernanceService,
3943
analytics_service: AnalyticsService,
44+
github_resolver: "GitHubProvenanceResolver | None" = None,
4045
) -> None:
4146
self._store = store
4247
self._detection = detection_service
4348
self._governance = governance_service
4449
self._analytics = analytics_service
50+
self._github_resolver = github_resolver
4551

4652
def ingest_analysis(
4753
self,
@@ -108,8 +114,8 @@ def execute_analysis(self, analysis_id: str) -> None:
108114
record.error_message = str(exc)
109115
self._store.update_analysis(record)
110116

111-
@staticmethod
112117
def _map_changed_line(
118+
self,
113119
analysis_id: str,
114120
request: AnalysisIngestionRequest,
115121
payload: ChangedLinePayload,
@@ -124,6 +130,16 @@ def _map_changed_line(
124130
commit_sha=payload.attribution.commit_sha,
125131
provenance_marker=payload.attribution.provenance_marker,
126132
)
133+
if not attribution.agent.agent_id:
134+
agent_id, session_id = self._resolve_agent(
135+
repo=request.repo,
136+
pr_number=request.pr_number,
137+
commit_sha=attribution.commit_sha,
138+
)
139+
if agent_id:
140+
attribution.agent.agent_id = agent_id
141+
if session_id:
142+
attribution.agent_session_id = session_id
127143
return ChangedLine(
128144
analysis_id=analysis_id,
129145
repo_id=request.repo,
@@ -140,6 +156,17 @@ def _map_changed_line(
140156
attribution=attribution,
141157
)
142158

159+
def _resolve_agent(
160+
self,
161+
*,
162+
repo: str,
163+
pr_number: str,
164+
commit_sha: str | None,
165+
) -> tuple[str | None, str | None]:
166+
if not self._github_resolver:
167+
return None, None
168+
return self._github_resolver.resolve_agent(repo, pr_number, commit_sha)
169+
143170
def list_findings(self, analysis_id: str) -> list[Finding]:
144171
return self._store.list_findings(analysis_id)
145172

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ dependencies = [
1313
"redis>=5.0,<6.0",
1414
"semgrep>=1.64,<2.0",
1515
"opentelemetry-api>=1.25,<2.0",
16-
"opentelemetry-sdk>=1.25,<2.0"
16+
"opentelemetry-sdk>=1.25,<2.0",
17+
"PyGithub>=2.4,<3.0"
1718
]
1819

1920
[project.optional-dependencies]
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from __future__ import annotations
2+
3+
from types import SimpleNamespace
4+
5+
from app.provenance.github_resolver import GitHubProvenanceResolver
6+
7+
8+
class StubCommit:
    """Minimal stand-in for a commit object as read by the resolver.

    Exposes ``commit.message`` and an ``author`` that is either an object
    with a ``login`` attribute or ``None`` when no (truthy) login is given.
    """

    def __init__(self, message: str, author_login: str | None = None):
        self.commit = SimpleNamespace(message=message)
        if author_login:
            self.author = SimpleNamespace(login=author_login)
        else:
            self.author = None
12+
13+
14+
def test_resolver_extracts_agent_from_commit_trailer(monkeypatch):
    """An ``Agent-ID:`` trailer in the commit message yields the agent id."""

    def fake_fetch_commit(self, repo, sha):
        return StubCommit("Fix bug\nAgent-ID: claude-3-opus")

    resolver = GitHubProvenanceResolver(token="token")
    monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_commit", fake_fetch_commit)

    agent, session = resolver.resolve_agent("acme/repo", "42", "abc123")

    assert agent == "claude-3-opus"
    assert session is None
24+
25+
26+
def test_resolver_uses_coauthor(monkeypatch):
    """A Copilot ``Co-authored-by:`` trailer maps to ``github-copilot``."""
    message = "Refactor\nCo-authored-by: GitHub Copilot <copilot@example.com>"

    def fake_fetch_commit(self, repo, sha):
        return StubCommit(message)

    resolver = GitHubProvenanceResolver(token="token")
    monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_commit", fake_fetch_commit)

    agent, _ = resolver.resolve_agent("acme/repo", None, "def456")

    assert agent == "github-copilot"
32+
33+
34+
def test_resolver_falls_back_to_pr_labels(monkeypatch):
    """Without usable commit data, a prefixed PR label supplies the agent id."""

    def fake_fetch_commit(self, repo, sha):
        return None

    def fake_fetch_pr_labels(self, repo, pr):
        return ["Agent: gemini-pro"]

    resolver = GitHubProvenanceResolver(token="token", agent_label_prefix="agent:")
    monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_commit", fake_fetch_commit)
    monkeypatch.setattr(GitHubProvenanceResolver, "_fetch_pr_labels", fake_fetch_pr_labels)

    agent, _ = resolver.resolve_agent("acme/repo", "77", None)

    assert agent == "gemini-pro"

uv.lock

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)