Commit de74c83

feat: enhance risk scoring with git mining, coverage parsing, and jules rules
- Implemented `GitMiner` for historical churn and author analysis.
- Implemented `CoverageParser` for Cobertura, JaCoCo, and Go coverage reports (a minimal sketch follows below).
- Enhanced `RiskScorer` with a multi-dimensional weighted scoring model.
- Added the `JulesRule` set for detecting LLM-generated code patterns.
- Integrated the new components into the `codesage scan` CLI.
- Added unit and integration tests for the new features.
1 parent 414e866 commit de74c83
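
Of the 13 changed files, only the `scan.py`, `risk_baseline.py`, and `miner.py` hunks are reproduced below; the `CoverageParser` diff is not among them. For orientation, here is a minimal sketch of the Cobertura case, assuming the parser maps file paths to line-coverage rates; the function name `parse_cobertura` and the return shape are hypothetical, not the committed API:

# Hypothetical sketch of Cobertura parsing; not the committed CoverageParser API.
import xml.etree.ElementTree as ET
from typing import Dict

def parse_cobertura(report_path: str) -> Dict[str, float]:
    """Map each source file to its line-coverage rate (0.0-1.0)."""
    rates: Dict[str, float] = {}
    root = ET.parse(report_path).getroot()
    # Cobertura nests <class filename="..." line-rate="..."> under <package> elements.
    for cls in root.iter("class"):
        filename = cls.get("filename")
        if filename:
            # Keep the lowest rate if a file appears under several classes.
            rate = float(cls.get("line-rate", "0"))
            rates[filename] = min(rates.get(filename, 1.0), rate)
    return rates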

File tree — 13 files changed: +1279 −273 lines


codesage/cli/commands/scan.py

Lines changed: 77 additions & 10 deletions

@@ -13,6 +13,10 @@
 from codesage.cli.plugin_loader import PluginManager
 from codesage.history.store import StorageEngine
 from codesage.core.interfaces import CodeIssue
+from codesage.risk.risk_scorer import RiskScorer
+from codesage.config.risk_baseline import RiskBaselineConfig
+from codesage.rules.jules_specific_rules import JULES_RULESET
+from codesage.rules.base import RuleContext
 from datetime import datetime, timezone
 
 def get_builder(language: str, path: Path):
@@ -144,8 +148,10 @@ def merge_snapshots(snapshots: List[ProjectSnapshot], project_name: str) -> Proj
 @click.option('--ci-mode', is_flag=True, help='Enable CI mode (auto-detect GitHub environment).')
 @click.option('--plugins-dir', default='.codesage/plugins', help='Directory containing plugins.')
 @click.option('--db-url', default='sqlite:///codesage.db', help='Database URL for storage.')
+@click.option('--git-repo', type=click.Path(), help='Path to the Git repository (used for change-history analysis).')
+@click.option('--coverage-report', type=click.Path(), help='Path to the coverage report (Cobertura/JaCoCo XML).')
 @click.pass_context
-def scan(ctx, path, language, reporter, output, fail_on_high, ci_mode, plugins_dir, db_url):
+def scan(ctx, path, language, reporter, output, fail_on_high, ci_mode, plugins_dir, db_url, git_repo, coverage_report):
     """
     Scan the codebase and report issues.
     """
@@ -205,16 +211,73 @@ def scan(ctx, path, language, reporter, output, fail_on_high, ci_mode, plugins_d
         click.echo(f"Failed to merge snapshots: {e}", err=True)
         ctx.exit(1)
 
-    # 3. Apply Custom Rules (Plugins)
+    # Populate file contents if missing (needed for rules)
+    click.echo("Populating file contents...")
+    for file_snapshot in snapshot.files:
+        if not file_snapshot.content:
+            try:
+                full_path = root_path / file_snapshot.path
+                if full_path.exists():
+                    file_snapshot.content = full_path.read_text(errors='ignore')
+                    # Update size if missing
+                    if file_snapshot.size is None:
+                        file_snapshot.size = len(file_snapshot.content)
+            except Exception:
+                # Best-effort read: unreadable files are skipped and rules see empty content.
+                pass
+
+    # 3. Apply Risk Scoring (Enhanced in Phase 1)
+    try:
+        risk_config = RiskBaselineConfig()  # Load default config
+        scorer = RiskScorer(
+            config=risk_config,
+            repo_path=git_repo or path,  # Default to the scanned path if not specified
+            coverage_report=coverage_report
+        )
+        snapshot = scorer.score_project(snapshot)
+    except Exception as e:
+        click.echo(f"Warning: Risk scoring failed: {e}", err=True)
+
+    # 4. Apply Custom Rules (Plugins & Jules Rules)
+
+    # RuleContext requires a RulesPythonBaselineConfig (see rules/base.py).
+    # The Jules rules may not use it, so a default instance is sufficient here.
+    from codesage.config.rules_python_baseline import RulesPythonBaselineConfig
+    rule_config = RulesPythonBaselineConfig()  # Default config
+
+    # Apply Jules-specific rules
+    click.echo("Applying Jules-specific rules...")
+    for rule in JULES_RULESET:
+        for file_snapshot in snapshot.files:
+            try:
+                # Create a per-file context and run the rule against it
+                rule_ctx = RuleContext(
+                    project=snapshot,
+                    file=file_snapshot,
+                    config=rule_config
+                )
+                issues = rule.check(rule_ctx)
+
+                if issues:
+                    if file_snapshot.issues is None:
+                        file_snapshot.issues = []
+                    file_snapshot.issues.extend(issues)
+            except Exception as e:
+                click.echo(f"Error applying rule {rule.rule_id} to {file_snapshot.path}: {e}", err=True)
+
+    # Apply plugin rules
     for rule in plugin_manager.rules:
         # Ensure we iterate over the list of files
         for file_snapshot in snapshot.files:
             file_path = Path(file_snapshot.path)
             try:
-                content = ""
-                full_path = root_path / file_path
-                if full_path.exists():
-                    content = full_path.read_text(errors='ignore')
+                # Content has already been populated above
+                content = file_snapshot.content or ""
 
                 issues = rule.check(str(file_path), content, {})
                 if issues:
@@ -249,29 +312,33 @@ def scan(ctx, path, language, reporter, output, fail_on_high, ci_mode, plugins_d
             except Exception as e:
                 click.echo(f"Error running rule {rule.id} on {file_path}: {e}", err=True)
 
-    # Recalculate Issues Summary after Plugins
-    # Simplified recalculation
+    # Recalculate the issues summary after plugins & Jules rules
     total_issues = 0
     by_severity = {}
+    by_rule = {}
 
     for f in snapshot.files:
         if f.issues:
             total_issues += len(f.issues)
             for issue in f.issues:
                 by_severity[issue.severity] = by_severity.get(issue.severity, 0) + 1
+                if issue.rule_id:
+                    by_rule[issue.rule_id] = by_rule.get(issue.rule_id, 0) + 1
 
     # Update snapshot summary if issues changed
    if snapshot.issues_summary:
        snapshot.issues_summary.total_issues = total_issues
        snapshot.issues_summary.by_severity = by_severity
+       snapshot.issues_summary.by_rule = by_rule
    else:
        snapshot.issues_summary = ProjectIssuesSummary(
            total_issues=total_issues,
-           by_severity=by_severity
+           by_severity=by_severity,
+           by_rule=by_rule
        )
 
 
-    # 4. Save to Storage
+    # 5. Save to Storage
     if storage:
         try:
             storage.save_snapshot(snapshot.metadata.project_name, snapshot)
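
For reference, the new flags wire into the command as `codesage scan <path> --git-repo <repo> --coverage-report <xml>`. A minimal smoke-test sketch using click's test runner, assuming `path` is the positional argument suggested by the signature (paths are placeholders):

# Smoke-test sketch for the new options; paths are placeholders.
from click.testing import CliRunner
from codesage.cli.commands.scan import scan

runner = CliRunner()
result = runner.invoke(scan, [
    ".",                               # positional scan path (assumed)
    "--git-repo", ".",
    "--coverage-report", "coverage.xml",
])
print(result.exit_code, result.output)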

codesage/config/risk_baseline.py

Lines changed: 10 additions & 7 deletions

@@ -1,4 +1,4 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 class RiskBaselineConfig(BaseModel):
     """Configuration for the baseline risk scorer."""
@@ -9,13 +9,16 @@ class RiskBaselineConfig(BaseModel):
     weight_fan_out: float = 0.2
     weight_loc: float = 0.1
 
-    # Weights for multi-dimensional scoring
-    # Final = w_static * static + w_churn * churn + w_cov * (static * (1-cov))
-    # Or as per task: Score = w1 * Complexity + w2 * Churn + w3 * (1 - Coverage)
-    # The "Complexity" here refers to the static score calculated above.
+    # Weights for multi-dimensional scoring (new model):
+    # Risk = w1·Complexity + w2·Churn + w3·(1-Coverage) + w4·AuthorDiversity + w5·FileSize
+    weight_complexity: float = Field(default=0.30, description="Weight for complexity score")
+    weight_churn: float = Field(default=0.25, description="Weight for git churn score")
+    weight_coverage: float = Field(default=0.25, description="Weight for coverage risk")
+    weight_author_diversity: float = Field(default=0.10, description="Weight for author diversity")
+    weight_file_size: float = Field(default=0.10, description="Weight for file size (LOC)")
 
+    # Legacy weights (kept for backward compatibility; superseded by the new model)
     weight_static_score: float = 0.5
-    weight_churn: float = 0.3
     weight_coverage_penalty: float = 0.2
 
     # Propagation
@@ -29,7 +32,7 @@ class RiskBaselineConfig(BaseModel):
 
     # Churn settings
     churn_since_days: int = 90
-    threshold_churn_high: int = 10  # If file changed > 10 times in 90 days, normalized churn = 1.0
+    threshold_churn_high: int = 10
 
     @classmethod
     def from_defaults(cls) -> "RiskBaselineConfig":
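
A minimal sketch of how these weights might combine, assuming every dimension has already been normalized to [0, 1]; the helper `combine_risk` is hypothetical, while the weights and the formula come from the config above:

# Sketch only: combine_risk is hypothetical; weights and formula are from the config above.
from codesage.config.risk_baseline import RiskBaselineConfig

def combine_risk(cfg: RiskBaselineConfig, complexity: float, churn: float,
                 coverage: float, author_diversity: float, file_size: float) -> float:
    """Weighted sum; every input is assumed pre-normalized to [0, 1]."""
    return (
        cfg.weight_complexity * complexity
        + cfg.weight_churn * churn
        + cfg.weight_coverage * (1.0 - coverage)        # low coverage -> high risk
        + cfg.weight_author_diversity * author_diversity
        + cfg.weight_file_size * file_size
    )

# A complex, frequently changed, poorly covered file:
# 0.30*0.8 + 0.25*0.6 + 0.25*(1 - 0.2) + 0.10*0.5 + 0.10*0.4 = 0.68
print(combine_risk(RiskBaselineConfig(), 0.8, 0.6, 0.2, 0.5, 0.4))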

codesage/git/miner.py

Lines changed: 121 additions & 0 deletions

@@ -0,0 +1,121 @@
+"""Git history miner.
+
+Implements the "code evolution analysis" capability from section 3.1.3 of the
+architecture design.
+"""
+from datetime import datetime, timedelta
+import logging
+from typing import Dict, List, Optional, Set
+import os
+
+try:
+    from git import Repo, InvalidGitRepositoryError
+except ImportError:
+    Repo = None
+    InvalidGitRepositoryError = None
+
+logger = logging.getLogger(__name__)
+
+
+class GitMiner:
+    """Git history miner.
+
+    Core metrics (aligned with the architecture design):
+    - Change frequency: number of commits within the last N days
+    - File hotness: cumulative changed lines / total lines (simplified here to
+      commit count; can be extended later)
+    - Author diversity: number of distinct authors (high diversity = high risk)
+    """
+
+    def __init__(self, repo_path: Optional[str] = None):
+        self.repo_path = repo_path or os.getcwd()
+        self.repo = None
+        self._churn_cache: Dict[str, int] = {}
+        self._author_cache: Dict[str, Set[str]] = {}
+        self._cache_initialized = False
+
+        if Repo:
+            try:
+                self.repo = Repo(self.repo_path, search_parent_directories=True)
+            except Exception as e:
+                # Covers InvalidGitRepositoryError and any other init failure.
+                logger.warning(f"Failed to initialize Git repo at {self.repo_path}: {e}")
+
+    def _initialize_stats(self, days: int = 90):
+        """Bulk-process commits to populate the churn and author caches."""
+        if self._cache_initialized:
+            return
+
+        if not self.repo:
+            return
+
+        try:
+            since_date = datetime.now() - timedelta(days=days)
+            # Iterating over all commits once is O(N_commits * M_files_changed),
+            # which is better than querying the log per file at O(F_files * N_commits).
+            commits = self.repo.iter_commits(since=since_date)
+
+            for commit in commits:
+                # commit.stats.files returns a dict {path: stats}
+                for file_path in commit.stats.files.keys():
+                    self._churn_cache[file_path] = self._churn_cache.get(file_path, 0) + 1
+
+                    if file_path not in self._author_cache:
+                        self._author_cache[file_path] = set()
+                    self._author_cache[file_path].add(commit.author.email)
+
+            self._cache_initialized = True
+        except Exception as e:
+            logger.error(f"Error initializing git stats: {e}")
+
+    def get_file_churn_score(self, file_path: str, days: int = 90) -> float:
+        """Score a file's change frequency on a 0-10 scale.
+
+        Algorithm: score = min(10, commit_count / (days / 30))
+        - 1 commit per month on average = 1 point
+        - 10 commits per month on average = 10 points (maximum)
+        """
+        if not self.repo:
+            return 0.0
+
+        # Ensure the cache is populated
+        self._initialize_stats(days)
+
+        # Exact path match is required. Git paths are relative to the repo
+        # root and `file_path` usually is too, but normalization may be needed
+        # if it comes from a different source. Assuming consistency for now.
+        commit_count = self._churn_cache.get(file_path, 0)
+
+        denominator = max(days / 30, 1)  # avoid division by zero for tiny windows
+        score = min(10.0, commit_count / denominator)
+        return round(score, 2)
+
+    def get_file_author_count(self, file_path: str) -> int:
+        """Count a file's historical contributors.
+
+        Used to assess "maintenance consistency risk":
+        - 1 maintainer: low risk (knowledge is concentrated)
+        - 5+ maintainers: high risk (expensive to understand)
+        """
+        if not self.repo:
+            return 0
+
+        self._initialize_stats()
+
+        authors = self._author_cache.get(file_path, set())
+        return len(authors)
+
+    def get_hotspot_files(self, top_n: int = 20) -> List[Dict]:
+        """Identify code hotspots (the most frequently changed files)."""
+        if not self.repo:
+            return []
+
+        self._initialize_stats()
+
+        sorted_files = sorted(self._churn_cache.items(), key=lambda x: x[1], reverse=True)[:top_n]
+
+        result = []
+        for path, count in sorted_files:
+            result.append({
+                "path": path,
+                "commits": count
+            })
+        return result
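
A quick usage sketch against the methods above; the repo path and file names are placeholders. With the default 90-day window, a file with 15 commits scores min(10, 15 / (90 / 30)) = 5.0:

# Usage sketch for GitMiner; repo path and file names are placeholders.
from codesage.git.miner import GitMiner

miner = GitMiner(repo_path=".")
print(miner.get_file_churn_score("codesage/cli/commands/scan.py"))   # e.g. 5.0 for 15 commits
print(miner.get_file_author_count("codesage/cli/commands/scan.py"))  # distinct author emails
for hotspot in miner.get_hotspot_files(top_n=5):
    print(f'{hotspot["path"]}: {hotspot["commits"]} commits')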
