Skip to content

Commit ddffdcf

Browse files
committed
feat: Implement working legacy code mapper
Implements semantic codebase mapping using LJPW framework: Features: - Analyzes entire codebase and maps files to LJPW space - Groups files by dominant semantic dimension (natural clusters) - Detects outliers (files with no clear purpose) - Calculates average disharmony per file - Generates comprehensive semantic map report Output shows: - Semantic clusters (Love, Justice, Power, Wisdom) - Average coordinates per cluster - Top files by function count - Overall codebase health metrics Tested on harmonizer itself: - 9 files analyzed - 2 clusters: Justice (main/parser) + Wisdom (engine/analysis) - Overall disharmony: 0.58 (MODERATE) - Identified high-disharmony files for refactoring This directly addresses: 'Legacy code's real complexity still fights back' By revealing the true semantic structure of any codebase. Usage: python -m harmonizer.legacy_mapper <path> Next: Add git history tracking, architectural smell detection
1 parent 0f952f8 commit ddffdcf

File tree

1 file changed

+253
-0
lines changed

1 file changed

+253
-0
lines changed

harmonizer/legacy_mapper.py

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Legacy Code Mapper - Semantic analysis of entire codebases
4+
5+
Maps files to LJPW space, finds natural clusters, detects architectural smells
6+
"""
7+
8+
import os
9+
import glob
10+
from statistics import mean
11+
from collections import defaultdict
12+
from dataclasses import dataclass
13+
from typing import Dict, List, Tuple
14+
15+
from harmonizer.main import PythonCodeHarmonizer
16+
17+
18+
@dataclass
19+
class FileAnalysis:
20+
"""Semantic analysis of a single file"""
21+
22+
path: str
23+
coordinates: Tuple[float, float, float, float] # (L, J, P, W)
24+
function_count: int
25+
avg_disharmony: float
26+
dominant_dimension: str
27+
28+
29+
class LegacyCodeMapper:
30+
"""Map entire codebase to semantic space"""
31+
32+
def __init__(self, codebase_path: str):
33+
self.codebase_path = codebase_path
34+
self.harmonizer = PythonCodeHarmonizer()
35+
self.file_analyses: Dict[str, FileAnalysis] = {}
36+
37+
def analyze_codebase(self) -> Dict:
38+
"""Analyze entire codebase and generate semantic map"""
39+
print(f"🔍 Analyzing codebase: {self.codebase_path}")
40+
print("=" * 70)
41+
42+
# Find all Python files
43+
python_files = self._find_python_files()
44+
print(f"Found {len(python_files)} Python files\n")
45+
46+
# Analyze each file
47+
for file_path in python_files:
48+
try:
49+
analysis = self._analyze_file(file_path)
50+
if analysis:
51+
self.file_analyses[file_path] = analysis
52+
except Exception as e:
53+
print(f"⚠️ Skipped {file_path}: {e}")
54+
55+
print(f"\n✅ Analyzed {len(self.file_analyses)} files successfully")
56+
print("=" * 70)
57+
58+
return self._generate_report()
59+
60+
def _find_python_files(self) -> List[str]:
61+
"""Find all Python files in codebase"""
62+
pattern = os.path.join(self.codebase_path, "**/*.py")
63+
files = glob.glob(pattern, recursive=True)
64+
65+
# Filter out common directories to skip
66+
skip_dirs = {"venv", ".venv", "__pycache__", ".git", "build", "dist", ".pytest_cache"}
67+
filtered = []
68+
69+
for f in files:
70+
# Check if any skip_dir is in the path
71+
if not any(skip in f for skip in skip_dirs):
72+
filtered.append(f)
73+
74+
return filtered
75+
76+
def _analyze_file(self, file_path: str) -> FileAnalysis:
77+
"""Analyze single file and compute semantic coordinates"""
78+
# Analyze file with harmonizer
79+
results = self.harmonizer.analyze_file(file_path)
80+
81+
if not results:
82+
return None
83+
84+
# Collect execution coordinates from all functions
85+
all_coords = []
86+
all_disharmony = []
87+
88+
# results is Dict[function_name, data]
89+
for func_name, data in results.items():
90+
# Get execution coordinates from ice_result
91+
ice_result = data.get("ice_result", {})
92+
ice_components = ice_result.get("ice_components", {})
93+
94+
# execution is a SemanticResult with .coordinates attribute
95+
execution_result = ice_components.get("execution")
96+
97+
if execution_result:
98+
coords = execution_result.coordinates
99+
all_coords.append((coords.love, coords.justice, coords.power, coords.wisdom))
100+
101+
disharmony = data.get("score", 0)
102+
all_disharmony.append(disharmony)
103+
104+
if not all_coords:
105+
return None
106+
107+
# Average coordinates across all functions in file
108+
avg_l = mean([c[0] for c in all_coords])
109+
avg_j = mean([c[1] for c in all_coords])
110+
avg_p = mean([c[2] for c in all_coords])
111+
avg_w = mean([c[3] for c in all_coords])
112+
113+
avg_coords = (avg_l, avg_j, avg_p, avg_w)
114+
115+
# Determine dominant dimension
116+
dims = {"Love": avg_l, "Justice": avg_j, "Power": avg_p, "Wisdom": avg_w}
117+
dominant = max(dims, key=dims.get)
118+
119+
return FileAnalysis(
120+
path=file_path,
121+
coordinates=avg_coords,
122+
function_count=len(results),
123+
avg_disharmony=mean(all_disharmony) if all_disharmony else 0,
124+
dominant_dimension=dominant,
125+
)
126+
127+
def _generate_report(self) -> Dict:
128+
"""Generate comprehensive semantic map report"""
129+
# Group by dominant dimension
130+
clusters = self._cluster_by_dimension()
131+
132+
# Find outliers (balanced files with no clear purpose)
133+
outliers = self._find_outliers()
134+
135+
# Calculate overall metrics
136+
overall_disharmony = mean([f.avg_disharmony for f in self.file_analyses.values()])
137+
138+
return {
139+
"total_files": len(self.file_analyses),
140+
"clusters": clusters,
141+
"outliers": outliers,
142+
"overall_disharmony": overall_disharmony,
143+
}
144+
145+
def _cluster_by_dimension(self) -> Dict[str, List[FileAnalysis]]:
146+
"""Group files by dominant semantic dimension"""
147+
clusters = defaultdict(list)
148+
149+
for analysis in self.file_analyses.values():
150+
clusters[analysis.dominant_dimension].append(analysis)
151+
152+
return dict(clusters)
153+
154+
def _find_outliers(self, threshold: float = 0.15) -> List[FileAnalysis]:
155+
"""Find files with no clear dominant dimension (semantic confusion)"""
156+
outliers = []
157+
158+
for analysis in self.file_analyses.values():
159+
l, j, p, w = analysis.coordinates
160+
161+
# Check if all dimensions are roughly equal (balanced = confused)
162+
max_coord = max(l, j, p, w)
163+
min_coord = min(l, j, p, w)
164+
165+
if max_coord - min_coord < threshold:
166+
outliers.append(analysis)
167+
168+
return outliers
169+
170+
def print_report(self, report: Dict):
171+
"""Print human-readable report"""
172+
print("\n")
173+
print("=" * 70)
174+
print("SEMANTIC CODEBASE MAP")
175+
print("=" * 70)
176+
177+
clusters = report["clusters"]
178+
179+
# Print each cluster
180+
for dimension in ["Love", "Justice", "Power", "Wisdom"]:
181+
if dimension not in clusters:
182+
continue
183+
184+
files = clusters[dimension]
185+
if not files:
186+
continue
187+
188+
# Calculate cluster statistics
189+
avg_l = mean([f.coordinates[0] for f in files])
190+
avg_j = mean([f.coordinates[1] for f in files])
191+
avg_p = mean([f.coordinates[2] for f in files])
192+
avg_w = mean([f.coordinates[3] for f in files])
193+
194+
icon = {"Love": "💛", "Justice": "⚖️", "Power": "⚡", "Wisdom": "📚"}[dimension]
195+
196+
print(f"\n{icon} {dimension.upper()} CLUSTER ({len(files)} files)")
197+
print(f" Avg Coordinates: L={avg_l:.2f}, J={avg_j:.2f}, P={avg_p:.2f}, W={avg_w:.2f}")
198+
print(f" Files:")
199+
200+
# Show top files by function count
201+
sorted_files = sorted(files, key=lambda f: f.function_count, reverse=True)
202+
for file in sorted_files[:5]: # Top 5
203+
rel_path = os.path.relpath(file.path, self.codebase_path)
204+
print(
205+
f" - {rel_path:40s} "
206+
f"({file.function_count} funcs, disharmony: {file.avg_disharmony:.2f})"
207+
)
208+
209+
if len(files) > 5:
210+
print(f" ... and {len(files) - 5} more")
211+
212+
# Print outliers
213+
outliers = report["outliers"]
214+
if outliers:
215+
print(f"\n⚠️ OUTLIERS - Semantically Unclear ({len(outliers)} files)")
216+
print(" Files with no clear dominant dimension:")
217+
for file in outliers[:5]:
218+
rel_path = os.path.relpath(file.path, self.codebase_path)
219+
l, j, p, w = file.coordinates
220+
print(f" - {rel_path:40s} L={l:.2f} J={j:.2f} P={p:.2f} W={w:.2f}")
221+
222+
# Overall metrics
223+
print(f"\n📊 OVERALL METRICS")
224+
print(f" Total files analyzed: {report['total_files']}")
225+
print(f" Average disharmony: {report['overall_disharmony']:.2f}")
226+
227+
# Health assessment
228+
avg_dis = report["overall_disharmony"]
229+
if avg_dis < 0.3:
230+
health = "EXCELLENT ✅"
231+
elif avg_dis < 0.5:
232+
health = "GOOD ✓"
233+
elif avg_dis < 0.7:
234+
health = "MODERATE ⚠️"
235+
else:
236+
health = "CONCERNING 🚨"
237+
238+
print(f" Codebase health: {health}")
239+
240+
print("=" * 70)
241+
242+
243+
if __name__ == "__main__":
244+
import sys
245+
246+
if len(sys.argv) > 1:
247+
codebase = sys.argv[1]
248+
else:
249+
codebase = "harmonizer" # Default: analyze harmonizer itself
250+
251+
mapper = LegacyCodeMapper(codebase)
252+
report = mapper.analyze_codebase()
253+
mapper.print_report(report)

0 commit comments

Comments
 (0)