#!/usr/bin/env python3
"""
Legacy Code Mapper - Semantic analysis of entire codebases

Maps files to LJPW (Love, Justice, Power, Wisdom) space, finds natural
clusters, and detects architectural smells.
"""
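
# Usage (mirrors the __main__ block at the bottom of this file):
#   python legacy_code_mapper.py path/to/codebase
# or programmatically:
#   mapper = LegacyCodeMapper("path/to/codebase")
#   report = mapper.analyze_codebase()
#   mapper.print_report(report)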

import os
import glob
from statistics import mean
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from harmonizer.main import PythonCodeHarmonizer


@dataclass
class FileAnalysis:
    """Semantic analysis of a single file"""

    path: str
    coordinates: Tuple[float, float, float, float]  # (L, J, P, W)
    function_count: int
    avg_disharmony: float
    dominant_dimension: str

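# An illustrative instance (all values hypothetical):
#   FileAnalysis(path="pkg/io_utils.py", coordinates=(0.61, 0.12, 0.18, 0.09),
#                function_count=7, avg_disharmony=0.22, dominant_dimension="Love")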

class LegacyCodeMapper:
    """Map entire codebase to semantic space"""

    def __init__(self, codebase_path: str):
        self.codebase_path = codebase_path
        self.harmonizer = PythonCodeHarmonizer()
        self.file_analyses: Dict[str, FileAnalysis] = {}

    def analyze_codebase(self) -> Dict:
        """Analyze entire codebase and generate semantic map"""
        print(f"🔍 Analyzing codebase: {self.codebase_path}")
        print("=" * 70)

        # Find all Python files
        python_files = self._find_python_files()
        print(f"Found {len(python_files)} Python files\n")

        # Analyze each file
        for file_path in python_files:
            try:
                analysis = self._analyze_file(file_path)
                if analysis:
                    self.file_analyses[file_path] = analysis
            except Exception as e:
                print(f"⚠️  Skipped {file_path}: {e}")

        print(f"\n✅ Analyzed {len(self.file_analyses)} files successfully")
        print("=" * 70)

        return self._generate_report()

    def _find_python_files(self) -> List[str]:
        """Find all Python files in codebase"""
        pattern = os.path.join(self.codebase_path, "**/*.py")
        files = glob.glob(pattern, recursive=True)

        # Filter out common directories to skip
        skip_dirs = {"venv", ".venv", "__pycache__", ".git", "build", "dist", ".pytest_cache"}
        filtered = []

        for f in files:
            # Match whole path components rather than substrings, so a
            # directory like "rebuild/" is not accidentally skipped
            parts = set(os.path.normpath(f).split(os.sep))
            if not parts & skip_dirs:
                filtered.append(f)

        return filtered

    def _analyze_file(self, file_path: str) -> Optional[FileAnalysis]:
        """Analyze single file and compute semantic coordinates"""
        # Analyze file with harmonizer
        results = self.harmonizer.analyze_file(file_path)

        if not results:
            return None

        # Collect execution coordinates from all functions
        all_coords = []
        all_disharmony = []

        # results is Dict[function_name, data]
        for func_name, data in results.items():
            # Get execution coordinates from ice_result
            ice_result = data.get("ice_result", {})
            ice_components = ice_result.get("ice_components", {})

            # execution is a SemanticResult with a .coordinates attribute
            execution_result = ice_components.get("execution")

            if execution_result:
                coords = execution_result.coordinates
                all_coords.append((coords.love, coords.justice, coords.power, coords.wisdom))

            disharmony = data.get("score", 0)
            all_disharmony.append(disharmony)

        if not all_coords:
            return None

        # Average coordinates across all functions in file
        avg_l = mean([c[0] for c in all_coords])
        avg_j = mean([c[1] for c in all_coords])
        avg_p = mean([c[2] for c in all_coords])
        avg_w = mean([c[3] for c in all_coords])

        avg_coords = (avg_l, avg_j, avg_p, avg_w)

        # Determine dominant dimension
        dims = {"Love": avg_l, "Justice": avg_j, "Power": avg_p, "Wisdom": avg_w}
        dominant = max(dims, key=dims.get)

        return FileAnalysis(
            path=file_path,
            coordinates=avg_coords,
            function_count=len(results),
            avg_disharmony=mean(all_disharmony) if all_disharmony else 0.0,
            dominant_dimension=dominant,
        )

    def _generate_report(self) -> Dict:
        """Generate comprehensive semantic map report"""
        # Group by dominant dimension
        clusters = self._cluster_by_dimension()

        # Find outliers (balanced files with no clear purpose)
        outliers = self._find_outliers()

        # Calculate overall metrics (mean() raises on an empty sequence,
        # so guard against the case where nothing was analyzed)
        analyses = list(self.file_analyses.values())
        overall_disharmony = mean([f.avg_disharmony for f in analyses]) if analyses else 0.0

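        # Report shape, with hypothetical values:
        #   {"total_files": 12, "clusters": {"Love": [FileAnalysis, ...], ...},
        #    "outliers": [FileAnalysis, ...], "overall_disharmony": 0.34}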
        return {
            "total_files": len(self.file_analyses),
            "clusters": clusters,
            "outliers": outliers,
            "overall_disharmony": overall_disharmony,
        }

    def _cluster_by_dimension(self) -> Dict[str, List[FileAnalysis]]:
        """Group files by dominant semantic dimension"""
        clusters = defaultdict(list)

        for analysis in self.file_analyses.values():
            clusters[analysis.dominant_dimension].append(analysis)

        return dict(clusters)

    def _find_outliers(self, threshold: float = 0.15) -> List[FileAnalysis]:
        """Find files with no clear dominant dimension (semantic confusion)"""
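        # Worked example with the default threshold of 0.15: coordinates
        # (0.30, 0.28, 0.25, 0.31) have spread 0.31 - 0.25 = 0.06 < 0.15,
        # so the file is flagged; (0.70, 0.10, 0.10, 0.10) has spread 0.60
        # and is not.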
        outliers = []

        for analysis in self.file_analyses.values():
            l, j, p, w = analysis.coordinates

            # Check if all dimensions are roughly equal (balanced = confused)
            max_coord = max(l, j, p, w)
            min_coord = min(l, j, p, w)

            if max_coord - min_coord < threshold:
                outliers.append(analysis)

        return outliers

    def print_report(self, report: Dict):
        """Print human-readable report"""
        print("\n")
        print("=" * 70)
        print("SEMANTIC CODEBASE MAP")
        print("=" * 70)

        clusters = report["clusters"]

        # Print each cluster
        for dimension in ["Love", "Justice", "Power", "Wisdom"]:
            if dimension not in clusters:
                continue

            files = clusters[dimension]
            if not files:
                continue

            # Calculate cluster statistics
            avg_l = mean([f.coordinates[0] for f in files])
            avg_j = mean([f.coordinates[1] for f in files])
            avg_p = mean([f.coordinates[2] for f in files])
            avg_w = mean([f.coordinates[3] for f in files])

            icon = {"Love": "💛", "Justice": "⚖️", "Power": "⚡", "Wisdom": "📚"}[dimension]

            print(f"\n{icon} {dimension.upper()} CLUSTER ({len(files)} files)")
            print(f"   Avg Coordinates: L={avg_l:.2f}, J={avg_j:.2f}, P={avg_p:.2f}, W={avg_w:.2f}")
            print("   Files:")

            # Show top files by function count
            sorted_files = sorted(files, key=lambda f: f.function_count, reverse=True)
            for file in sorted_files[:5]:  # Top 5
                rel_path = os.path.relpath(file.path, self.codebase_path)
                print(
                    f"   - {rel_path:40s} "
                    f"({file.function_count} funcs, disharmony: {file.avg_disharmony:.2f})"
                )

            if len(files) > 5:
                print(f"   ... and {len(files) - 5} more")

        # Print outliers
        outliers = report["outliers"]
        if outliers:
            print(f"\n⚠️  OUTLIERS - Semantically Unclear ({len(outliers)} files)")
            print("   Files with no clear dominant dimension:")
            for file in outliers[:5]:
                rel_path = os.path.relpath(file.path, self.codebase_path)
                l, j, p, w = file.coordinates
                print(f"   - {rel_path:40s} L={l:.2f} J={j:.2f} P={p:.2f} W={w:.2f}")

        # Overall metrics
        print("\n📊 OVERALL METRICS")
        print(f"   Total files analyzed: {report['total_files']}")
        print(f"   Average disharmony: {report['overall_disharmony']:.2f}")

        # Health assessment
        avg_dis = report["overall_disharmony"]
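        # The 0.3 / 0.5 / 0.7 bands are heuristics that assume disharmony
        # scores fall roughly on a 0-1 scale.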
        if avg_dis < 0.3:
            health = "EXCELLENT ✅"
        elif avg_dis < 0.5:
            health = "GOOD ✓"
        elif avg_dis < 0.7:
            health = "MODERATE ⚠️"
        else:
            health = "CONCERNING 🚨"

        print(f"   Codebase health: {health}")

        print("=" * 70)


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        codebase = sys.argv[1]
    else:
        codebase = "harmonizer"  # Default: analyze harmonizer itself

    mapper = LegacyCodeMapper(codebase)
    report = mapper.analyze_codebase()
    mapper.print_report(report)