|
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import logging |
| 8 | +import os |
8 | 9 | import sys |
9 | 10 | import time |
10 | 11 | from collections import defaultdict |
@@ -218,63 +219,89 @@ def analyze(self) -> ProjectInfo: |
218 | 219 |
|
def _scan_files(self, progress_every: int = 500) -> None:
    """Scan and parse all source files under ``self.root_path``.

    Walks the tree with ``os.walk``, pruning every directory whose name is
    in ``self.IGNORE_DIRS`` and skipping file names in ``self.IGNORE_FILES``.
    A file's language comes from ``self.LANGUAGE_EXTENSIONS`` (keyed by the
    lower-cased suffix); extensionless files are classified from their first
    line via ``self._language_from_shebang``. Each matched file is counted in
    ``self.languages``, parsed with ``self.ts_parser`` when it is available
    for the language, otherwise with ``self.fallback_parser``, and the
    resulting module (with ``file_bytes`` set) is appended to
    ``self.modules``.

    Args:
        progress_every: When ``self.verbose`` is set, log a progress line
            each time this many files have been seen. Values ``<= 0``
            disable periodic progress logging.
    """
    # Monotonic clock: elapsed time must not jump if the wall clock changes.
    scan_start = time.monotonic()
    files_seen = 0
    files_matched = 0
    files_parsed = 0

    for root, dirnames, filenames in os.walk(self.root_path):
        # Prune in place so os.walk never descends into ignored directories.
        dirnames[:] = [d for d in dirnames if d not in self.IGNORE_DIRS]

        for filename in filenames:
            files_seen += 1

            # Report before any filter can `continue`, so a line is emitted
            # on every interval regardless of whether this file matches.
            if self.verbose and progress_every > 0 and files_seen % progress_every == 0:
                log.info(
                    "Scan progress: seen=%d matched=%d parsed=%d modules=%d time=%.2fs",
                    files_seen,
                    files_matched,
                    files_parsed,
                    len(self.modules),
                    time.monotonic() - scan_start,
                )

            if filename in self.IGNORE_FILES:
                continue

            fp = Path(root) / filename
            ext = fp.suffix.lower()
            language = self.LANGUAGE_EXTENSIONS.get(ext)
            if language is None and ext == '':
                # Extensionless file: fall back to the shebang line.
                try:
                    with fp.open('r', encoding='utf-8', errors='ignore') as f:
                        language = self._language_from_shebang(f.readline())
                except Exception:
                    language = None

            if language is None:
                continue

            files_matched += 1
            self.languages[language] += 1

            # Best effort: an unreadable file is counted but not parsed.
            try:
                content = fp.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                continue

            try:
                rel_path = str(fp.relative_to(self.root_path))
            except ValueError:
                # Not expressible relative to the root; keep the full path.
                rel_path = str(fp)

            # Try Tree-sitter first, then the fallback parser.
            module = None
            try:
                if self.ts_parser and self.ts_parser.is_available(language):
                    module = self.ts_parser.parse(rel_path, content, language)
            except Exception as e:
                if self.verbose:
                    log.debug("Tree-sitter parser failed for %s: %s", rel_path, e)

            if module is None:
                try:
                    module = self.fallback_parser.parse(rel_path, content, language)
                except Exception as e:
                    if self.verbose:
                        log.debug("Fallback parser failed for %s: %s", rel_path, e)
                    continue

            if module:
                files_parsed += 1
                try:
                    module.file_bytes = fp.stat().st_size
                except Exception:
                    # stat failed; approximate the size from the decoded text.
                    module.file_bytes = len(content.encode('utf-8', errors='ignore'))
                self.modules.append(module)

    if self.verbose:
        log.info(
            "Scan finished: seen=%d matched=%d parsed=%d modules=%d time=%.2fs",
            files_seen,
            files_matched,
            files_parsed,
            len(self.modules),
            time.monotonic() - scan_start,
        )
278 | 305 |
|
279 | 306 | def _detect_entrypoints(self) -> List[str]: |
280 | 307 | """Detect project entry points.""" |
|
0 commit comments