|
2 | 2 |
|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
| 5 | +import os |
| 6 | +from concurrent.futures import ThreadPoolExecutor, as_completed |
5 | 7 | from pathlib import Path |
6 | 8 | from typing import Optional |
7 | 9 |
|
|
19 | 21 |
|
20 | 22 | TOON_EXTENSIONS = {".toon.yaml", ".toon"} |
21 | 23 |
|
| 24 | +_MAX_WORKERS = min(os.cpu_count() or 1, 8) |
| 25 | + |
| 26 | + |
| 27 | +class _CompiledPatterns: |
| 28 | + """Pre-compiled pattern set: exact names in a frozenset, globs in one regex.""" |
| 29 | + |
| 30 | + __slots__ = ("exact", "regex", "is_empty") |
| 31 | + |
| 32 | + def __init__(self, exact: frozenset[str], regex, is_empty: bool): |
| 33 | + self.exact = exact |
| 34 | + self.regex = regex |
| 35 | + self.is_empty = is_empty |
| 36 | + |
| 37 | + |
def _compile_patterns(raw: list[str]) -> _CompiledPatterns:
    """Split *raw* glob strings into an exact-match set and a combined regex.

    Plain names (no glob metacharacters) go into a frozenset for O(1)
    membership tests; glob patterns are translated individually and joined
    into one alternation regex.

    Each translated pattern is anchored with ``\\A``: ``fnmatch.translate``
    only anchors the end (``\\Z``), so a bare ``regex.search`` would also
    match suffixes (e.g. pattern ``test?`` hitting ``"xtest1"``), which is
    not fnmatch's whole-string semantics. ``|`` binds loosest, so the
    combined pattern stays a union of fully anchored branches.
    """
    import fnmatch
    import re

    if not raw:
        return _CompiledPatterns(frozenset(), None, True)

    exact: set[str] = set()
    regex_parts: list[str] = []

    for pat in dict.fromkeys(raw):  # deduplicate, preserve order
        if any(c in pat for c in "*?[]"):
            regex_parts.append(r"\A" + fnmatch.translate(pat))
        else:
            exact.add(pat)

    compiled_re = re.compile("|".join(regex_parts)) if regex_parts else None
    return _CompiledPatterns(frozenset(exact), compiled_re, False)
| 57 | + |
| 58 | + |
def _validate_single_file(file_path: Path, settings: VallmSettings):
    """Validate a single file (top-level for thread-pool compatibility).

    Returns a ``(file_path, lang_obj, result, error_str)`` tuple; on
    failure ``error_str`` is set and ``result`` is ``None``. Validation
    results are fetched from / stored into the shared file cache.
    """
    from vallm.validators.file_cache import get_file_cache

    lang_obj = detect_language(file_path)
    if lang_obj is None:
        return file_path, None, None, "Unsupported file type"

    # Cache hit: skip reading and re-validating entirely.
    cache = get_file_cache()
    cached = cache.get(file_path)
    if cached is not None:
        return file_path, lang_obj, cached, None

    try:
        code = file_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return file_path, None, None, "Unable to read file (binary?)"
    except OSError as e:
        # A file can vanish or lose permissions between discovery and
        # validation; report a per-file failure instead of letting the
        # exception surface as a generic worker error.
        return file_path, None, None, f"Unable to read file: {e}"

    proposal = Proposal(
        code=code,
        language=lang_obj.tree_sitter_id,
        filename=str(file_path),
    )
    result = validate(proposal, settings)
    cache.set(file_path, result)
    return file_path, lang_obj, result, None
| 88 | + |
22 | 89 |
|
23 | 90 | class BatchProcessor: |
24 | 91 | """Handles batch validation of multiple files.""" |
@@ -134,19 +201,12 @@ def _filter_files( |
134 | 201 | return filtered_files |
135 | 202 |
|
136 | 203 | def _parse_filter_patterns(self, include: Optional[str], exclude: Optional[str]) -> dict: |
137 | | - """Parse include and exclude patterns.""" |
138 | | - import fnmatch |
139 | | - |
140 | | - patterns = {"include": [], "exclude": []} |
141 | | - |
142 | | - if include: |
143 | | - patterns["include"] = include.split(",") |
144 | | - |
| 204 | + """Parse include and exclude patterns into compiled matchers.""" |
| 205 | + raw_exclude: list[str] = [] |
145 | 206 | if exclude: |
146 | | - patterns["exclude"] = exclude.split(",") |
147 | | - |
148 | | - # Add default exclude patterns |
149 | | - patterns["exclude"].extend([ |
| 207 | + raw_exclude = exclude.split(",") |
| 208 | + |
| 209 | + raw_exclude.extend([ |
150 | 210 | # Python |
151 | 211 | "*.pyc", "*.pyo", "*.pyd", "__pycache__", ".pytest_cache", |
152 | 212 | "*.egg-info", "build", "dist", ".tox", ".coverage", "htmlcov", |
@@ -180,44 +240,47 @@ def _parse_filter_patterns(self, include: Optional[str], exclude: Optional[str]) |
180 | 240 | # Large data files |
181 | 241 | "*.jsonl", "*.parquet", "*.csv", "*.tsv", |
182 | 242 | ]) |
183 | | - |
184 | | - return patterns |
185 | | - |
186 | | - def _should_exclude_file(self, file_path: Path, exclude_patterns: list[str]) -> bool: |
187 | | - """Check if file should be excluded based on patterns.""" |
188 | | - import fnmatch |
189 | | - |
190 | | - file_str = str(file_path) |
| 243 | + |
| 244 | + raw_include: list[str] = [] |
| 245 | + if include: |
| 246 | + raw_include = include.split(",") |
| 247 | + |
| 248 | + return { |
| 249 | + "exclude": _compile_patterns(raw_exclude), |
| 250 | + "include": _compile_patterns(raw_include), |
| 251 | + } |
| 252 | + |
| 253 | + def _should_exclude_file(self, file_path: Path, compiled: _CompiledPatterns) -> bool: |
| 254 | + """Check if file should be excluded based on pre-compiled patterns.""" |
191 | 255 | file_name = file_path.name |
192 | | - file_str_lower = file_str.lower() |
| 256 | + file_str_lower = str(file_path).lower() |
193 | 257 |
|
194 | 258 | if any(file_str_lower.endswith(ext) for ext in TOON_EXTENSIONS): |
195 | 259 | return True |
196 | | - |
197 | | - for pattern in exclude_patterns: |
198 | | - # Check full path match |
199 | | - if fnmatch.fnmatch(file_str, pattern): |
200 | | - return True |
201 | | - # Check filename match |
202 | | - if fnmatch.fnmatch(file_name, pattern): |
203 | | - return True |
204 | | - # Check if any parent directory matches the pattern |
205 | | - for parent in file_path.parts: |
206 | | - if fnmatch.fnmatch(parent, pattern): |
207 | | - return True |
| 260 | + |
| 261 | + if file_name in compiled.exact or any(p in compiled.exact for p in file_path.parts): |
| 262 | + return True |
| 263 | + |
| 264 | + if compiled.regex and ( |
| 265 | + compiled.regex.search(file_name) |
| 266 | + or any(compiled.regex.search(p) for p in file_path.parts) |
| 267 | + ): |
| 268 | + return True |
| 269 | + |
208 | 270 | return False |
209 | | - |
210 | | - def _matches_include_pattern(self, file_path: Path, include_patterns: list[str]) -> bool: |
211 | | - """Check if file matches include patterns.""" |
212 | | - import fnmatch |
213 | | - |
214 | | - if not include_patterns: |
| 271 | + |
| 272 | + def _matches_include_pattern(self, file_path: Path, compiled: _CompiledPatterns) -> bool: |
| 273 | + """Check if file matches pre-compiled include patterns.""" |
| 274 | + if compiled.is_empty: |
215 | 275 | return True |
216 | | - |
217 | | - file_str = str(file_path) |
218 | | - for pattern in include_patterns: |
219 | | - if fnmatch.fnmatch(file_str, pattern) or fnmatch.fnmatch(file_path.name, pattern): |
220 | | - return True |
| 276 | + |
| 277 | + file_name = file_path.name |
| 278 | + if file_name in compiled.exact: |
| 279 | + return True |
| 280 | + |
| 281 | + if compiled.regex and compiled.regex.search(file_name): |
| 282 | + return True |
| 283 | + |
221 | 284 | return False |
222 | 285 |
|
223 | 286 | def _handle_no_files_found(self, output_format: str) -> None: |
@@ -302,31 +365,105 @@ def _process_files( |
302 | 365 | show_issues: bool, |
303 | 366 | ) -> tuple[dict, list, int, list]: |
304 | 367 | """Process all files for validation.""" |
| 368 | + use_parallel = ( |
| 369 | + not fail_fast |
| 370 | + and not verbose |
| 371 | + and len(filtered_files) >= 4 |
| 372 | + and _MAX_WORKERS > 1 |
| 373 | + ) |
| 374 | + if use_parallel: |
| 375 | + return self._process_files_parallel( |
| 376 | + filtered_files, settings, output_format, show_issues, |
| 377 | + ) |
| 378 | + return self._process_files_sequential( |
| 379 | + filtered_files, settings, output_format, fail_fast, verbose, show_issues, |
| 380 | + ) |
| 381 | + |
| 382 | + def _process_files_parallel( |
| 383 | + self, |
| 384 | + filtered_files: list[Path], |
| 385 | + settings: VallmSettings, |
| 386 | + output_format: str, |
| 387 | + show_issues: bool, |
| 388 | + ) -> tuple[dict, list, int, list]: |
| 389 | + """Process files using a thread pool for CPU-bound validators.""" |
305 | 390 | results_by_language: dict = {} |
306 | 391 | failed_files: list = [] |
307 | 392 | passed_count = 0 |
308 | 393 | total = len(filtered_files) |
| 394 | + done = 0 |
| 395 | + |
| 396 | + with ThreadPoolExecutor(max_workers=_MAX_WORKERS) as pool: |
| 397 | + futures = { |
| 398 | + pool.submit(_validate_single_file, fp, settings): fp |
| 399 | + for fp in filtered_files |
| 400 | + } |
| 401 | + for future in as_completed(futures): |
| 402 | + done += 1 |
| 403 | + try: |
| 404 | + file_path, lang_obj, result, error = future.result() |
| 405 | + except Exception as e: |
| 406 | + file_path = futures[future] |
| 407 | + failed_files.append((file_path, f"Error: {str(e)}")) |
| 408 | + continue |
| 409 | + |
| 410 | + if error is not None: |
| 411 | + failed_files.append((file_path, error)) |
| 412 | + continue |
| 413 | + |
| 414 | + self._show_progress(done, total, file_path, output_format) |
| 415 | + passed = self._handle_validation_result( |
| 416 | + result, file_path, lang_obj, output_format, |
| 417 | + show_issues, results_by_language, failed_files, |
| 418 | + ) |
| 419 | + if passed: |
| 420 | + passed_count += 1 |
| 421 | + |
| 422 | + return results_by_language, failed_files, passed_count, filtered_files |
| 423 | + |
| 424 | + def _process_files_sequential( |
| 425 | + self, |
| 426 | + filtered_files: list[Path], |
| 427 | + settings: VallmSettings, |
| 428 | + output_format: str, |
| 429 | + fail_fast: bool, |
| 430 | + verbose: bool, |
| 431 | + show_issues: bool, |
| 432 | + ) -> tuple[dict, list, int, list]: |
| 433 | + """Process files sequentially (used for fail_fast / verbose modes).""" |
| 434 | + from vallm.validators.file_cache import get_file_cache |
| 435 | + |
| 436 | + results_by_language: dict = {} |
| 437 | + failed_files: list = [] |
| 438 | + passed_count = 0 |
| 439 | + total = len(filtered_files) |
| 440 | + cache = get_file_cache() |
309 | 441 |
|
310 | 442 | for i, file_path in enumerate(filtered_files, 1): |
311 | 443 | try: |
312 | 444 | self._show_progress(i, total, file_path, output_format) |
313 | 445 |
|
314 | | - code = self._read_file_text(file_path) |
315 | | - if code is None: |
316 | | - failed_files.append((file_path, "Unable to read file (binary?)")) |
317 | | - continue |
318 | | - |
319 | 446 | lang_obj = self._detect_file_language(file_path) |
320 | 447 | if lang_obj is None: |
321 | 448 | failed_files.append((file_path, "Unsupported file type")) |
322 | 449 | continue |
323 | 450 |
|
324 | | - proposal = Proposal( |
325 | | - code=code, |
326 | | - language=lang_obj.tree_sitter_id, |
327 | | - filename=str(file_path), |
328 | | - ) |
329 | | - result = validate(proposal, settings) |
| 451 | + cached = cache.get(file_path) |
| 452 | + if cached is not None: |
| 453 | + result = cached |
| 454 | + else: |
| 455 | + code = self._read_file_text(file_path) |
| 456 | + if code is None: |
| 457 | + failed_files.append((file_path, "Unable to read file (binary?)")) |
| 458 | + continue |
| 459 | + |
| 460 | + proposal = Proposal( |
| 461 | + code=code, |
| 462 | + language=lang_obj.tree_sitter_id, |
| 463 | + filename=str(file_path), |
| 464 | + ) |
| 465 | + result = validate(proposal, settings) |
| 466 | + cache.set(file_path, result) |
330 | 467 |
|
331 | 468 | passed = self._handle_validation_result( |
332 | 469 | result, file_path, lang_obj, output_format, |
|
0 commit comments