Skip to content

Commit d30dc93

Browse files
perf: 6 validation optimizations for faster batch processing
Six validation optimizations for faster batch processing:

1. Cache tree-sitter parsers: `lru_cache(32)` on `get_parser()` in ast_compare.py and in all import validators (Go, JS/TS, Rust) avoids re-creating a parser for every file.
2. Parallel ThreadPoolExecutor: the batch processor dispatches files to a thread pool (max 8 workers) when there are at least 4 files and neither fail_fast nor verbose is set, overlapping CPU and I/O work.
3. File validation cache (mtime + size): a new FileValidationCache in file_cache.py skips re-validation of unchanged files — a large win in pyqual iteration loops.
4. Compiled exclude patterns: _CompiledPatterns splits exact names (a frozenset) from globs (a single compiled regex), eliminating roughly 25k fnmatch calls per batch.
5. Module-existence cache: a dict cache for importlib.util.find_spec results plus a one-time cwd scan for local packages avoids repeated filesystem lookups.
6. Pre-reading via the thread pool: file I/O is naturally overlapped by the ThreadPoolExecutor inside _validate_single_file, so no separate pre-read pass is needed.
1 parent 73a3bfb commit d30dc93

File tree

8 files changed

+330
-74
lines changed

8 files changed

+330
-74
lines changed

src/vallm/cli/batch_processor.py

Lines changed: 192 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from __future__ import annotations

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import NamedTuple, Optional
79

@@ -19,6 +21,71 @@
1921

2022
TOON_EXTENSIONS = {".toon.yaml", ".toon"}
2123

24+
# Thread-pool size for parallel validation: one worker per CPU core, capped
# at 8 so large machines don't oversubscribe I/O; os.cpu_count() may return
# None, hence the `or 1` floor.
_MAX_WORKERS = min(os.cpu_count() or 1, 8)
25+
26+
27+
class _CompiledPatterns:
28+
"""Pre-compiled pattern set: exact names in a frozenset, globs in one regex."""
29+
30+
__slots__ = ("exact", "regex", "is_empty")
31+
32+
def __init__(self, exact: frozenset[str], regex, is_empty: bool):
33+
self.exact = exact
34+
self.regex = regex
35+
self.is_empty = is_empty
36+
37+
38+
def _compile_patterns(raw: list[str]) -> _CompiledPatterns:
    """Turn *raw* pattern strings into a :class:`_CompiledPatterns` matcher.

    Literal names (no glob metacharacters) go into a frozenset for O(1)
    membership tests; each glob is translated to a regex fragment and all
    fragments are joined into one alternation compiled exactly once.
    """
    import fnmatch
    import re

    if not raw:
        return _CompiledPatterns(frozenset(), None, True)

    glob_chars = ("*", "?", "[", "]")

    def _is_glob(pattern: str) -> bool:
        return any(ch in pattern for ch in glob_chars)

    # dict.fromkeys deduplicates while preserving first-seen order.
    unique = list(dict.fromkeys(raw))
    literals = frozenset(p for p in unique if not _is_glob(p))
    fragments = [fnmatch.translate(p) for p in unique if _is_glob(p)]

    merged = re.compile("|".join(fragments)) if fragments else None
    return _CompiledPatterns(literals, merged, False)
57+
58+
59+
def _validate_single_file(file_path: Path, settings: VallmSettings):
    """Validate one file; module-level so thread-pool workers can call it.

    Returns a ``(file_path, lang_obj, result, error_str)`` tuple; when
    ``error_str`` is not None the other result fields are None.
    """
    from vallm.validators.file_cache import get_file_cache

    language = detect_language(file_path)
    if language is None:
        return file_path, None, None, "Unsupported file type"

    validation_cache = get_file_cache()
    prior = validation_cache.get(file_path)
    if prior is not None:
        # File unchanged since the last run — reuse the cached outcome.
        return file_path, language, prior, None

    try:
        source = file_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return file_path, None, None, "Unable to read file (binary?)"

    outcome = validate(
        Proposal(
            code=source,
            language=language.tree_sitter_id,
            filename=str(file_path),
        ),
        settings,
    )
    validation_cache.set(file_path, outcome)
    return file_path, language, outcome, None
88+
2289

2390
class BatchProcessor:
2491
"""Handles batch validation of multiple files."""
@@ -134,19 +201,12 @@ def _filter_files(
134201
return filtered_files
135202

136203
def _parse_filter_patterns(self, include: Optional[str], exclude: Optional[str]) -> dict:
137-
"""Parse include and exclude patterns."""
138-
import fnmatch
139-
140-
patterns = {"include": [], "exclude": []}
141-
142-
if include:
143-
patterns["include"] = include.split(",")
144-
204+
"""Parse include and exclude patterns into compiled matchers."""
205+
raw_exclude: list[str] = []
145206
if exclude:
146-
patterns["exclude"] = exclude.split(",")
147-
148-
# Add default exclude patterns
149-
patterns["exclude"].extend([
207+
raw_exclude = exclude.split(",")
208+
209+
raw_exclude.extend([
150210
# Python
151211
"*.pyc", "*.pyo", "*.pyd", "__pycache__", ".pytest_cache",
152212
"*.egg-info", "build", "dist", ".tox", ".coverage", "htmlcov",
@@ -180,44 +240,47 @@ def _parse_filter_patterns(self, include: Optional[str], exclude: Optional[str])
180240
# Large data files
181241
"*.jsonl", "*.parquet", "*.csv", "*.tsv",
182242
])
183-
184-
return patterns
185-
186-
def _should_exclude_file(self, file_path: Path, exclude_patterns: list[str]) -> bool:
187-
"""Check if file should be excluded based on patterns."""
188-
import fnmatch
189-
190-
file_str = str(file_path)
243+
244+
raw_include: list[str] = []
245+
if include:
246+
raw_include = include.split(",")
247+
248+
return {
249+
"exclude": _compile_patterns(raw_exclude),
250+
"include": _compile_patterns(raw_include),
251+
}
252+
253+
def _should_exclude_file(self, file_path: Path, compiled: _CompiledPatterns) -> bool:
    """Return True if *file_path* should be skipped.

    A file is excluded when:
      * its name ends with a TOON extension (case-insensitive), or
      * its name, full path, or any path component is an exact pattern, or
      * its name, full path, or any path component matches the combined
        glob regex.

    ``regex.match`` (not ``search``) is used so each glob keeps the
    whole-string semantics of ``fnmatch.fnmatch``: ``fnmatch.translate``
    anchors only the end of the string, so an unanchored search would let
    a glob like ``test?`` match the tail of a longer name. Matching the
    full path as well restores the original pre-compiled behavior for
    path-spanning patterns such as ``build/*``.
    """
    file_str = str(file_path)
    file_name = file_path.name

    if any(file_str.lower().endswith(ext) for ext in TOON_EXTENSIONS):
        return True

    if (
        file_name in compiled.exact
        or file_str in compiled.exact
        or any(part in compiled.exact for part in file_path.parts)
    ):
        return True

    if compiled.regex is not None:
        matcher = compiled.regex.match  # anchored: fnmatch-equivalent
        if (
            matcher(file_name)
            or matcher(file_str)
            or any(matcher(part) for part in file_path.parts)
        ):
            return True

    return False
209-
210-
def _matches_include_pattern(self, file_path: Path, include_patterns: list[str]) -> bool:
211-
"""Check if file matches include patterns."""
212-
import fnmatch
213-
214-
if not include_patterns:
271+
272+
def _matches_include_pattern(self, file_path: Path, compiled: _CompiledPatterns) -> bool:
273+
"""Check if file matches pre-compiled include patterns."""
274+
if compiled.is_empty:
215275
return True
216-
217-
file_str = str(file_path)
218-
for pattern in include_patterns:
219-
if fnmatch.fnmatch(file_str, pattern) or fnmatch.fnmatch(file_path.name, pattern):
220-
return True
276+
277+
file_name = file_path.name
278+
if file_name in compiled.exact:
279+
return True
280+
281+
if compiled.regex and compiled.regex.search(file_name):
282+
return True
283+
221284
return False
222285

223286
def _handle_no_files_found(self, output_format: str) -> None:
@@ -302,31 +365,105 @@ def _process_files(
302365
show_issues: bool,
303366
) -> tuple[dict, list, int, list]:
304367
"""Process all files for validation."""
368+
use_parallel = (
369+
not fail_fast
370+
and not verbose
371+
and len(filtered_files) >= 4
372+
and _MAX_WORKERS > 1
373+
)
374+
if use_parallel:
375+
return self._process_files_parallel(
376+
filtered_files, settings, output_format, show_issues,
377+
)
378+
return self._process_files_sequential(
379+
filtered_files, settings, output_format, fail_fast, verbose, show_issues,
380+
)
381+
382+
def _process_files_parallel(
    self,
    filtered_files: list[Path],
    settings: VallmSettings,
    output_format: str,
    show_issues: bool,
) -> tuple[dict, list, int, list]:
    """Process files using a thread pool for CPU-bound validators.

    Returns ``(results_by_language, failed_files, passed_count,
    filtered_files)`` — the same shape as the sequential path.

    Fix over the previous version: progress is now reported for EVERY
    completed future, not only for files that validate cleanly. Before,
    errored/unsupported files incremented ``done`` without ticking the
    display, so the progress counter visibly skipped numbers (and was
    inconsistent with the sequential path, which shows progress per file).
    """
    results_by_language: dict = {}
    failed_files: list = []
    passed_count = 0
    total = len(filtered_files)
    done = 0

    with ThreadPoolExecutor(max_workers=_MAX_WORKERS) as pool:
        futures = {
            pool.submit(_validate_single_file, fp, settings): fp
            for fp in filtered_files
        }
        for future in as_completed(futures):
            done += 1
            file_path = futures[future]
            # Show progress before branching so every file gets a tick.
            self._show_progress(done, total, file_path, output_format)
            try:
                file_path, lang_obj, result, error = future.result()
            except Exception as e:
                failed_files.append((file_path, f"Error: {str(e)}"))
                continue

            if error is not None:
                failed_files.append((file_path, error))
                continue

            passed = self._handle_validation_result(
                result, file_path, lang_obj, output_format,
                show_issues, results_by_language, failed_files,
            )
            if passed:
                passed_count += 1

    return results_by_language, failed_files, passed_count, filtered_files
423+
424+
def _process_files_sequential(
425+
self,
426+
filtered_files: list[Path],
427+
settings: VallmSettings,
428+
output_format: str,
429+
fail_fast: bool,
430+
verbose: bool,
431+
show_issues: bool,
432+
) -> tuple[dict, list, int, list]:
433+
"""Process files sequentially (used for fail_fast / verbose modes)."""
434+
from vallm.validators.file_cache import get_file_cache
435+
436+
results_by_language: dict = {}
437+
failed_files: list = []
438+
passed_count = 0
439+
total = len(filtered_files)
440+
cache = get_file_cache()
309441

310442
for i, file_path in enumerate(filtered_files, 1):
311443
try:
312444
self._show_progress(i, total, file_path, output_format)
313445

314-
code = self._read_file_text(file_path)
315-
if code is None:
316-
failed_files.append((file_path, "Unable to read file (binary?)"))
317-
continue
318-
319446
lang_obj = self._detect_file_language(file_path)
320447
if lang_obj is None:
321448
failed_files.append((file_path, "Unsupported file type"))
322449
continue
323450

324-
proposal = Proposal(
325-
code=code,
326-
language=lang_obj.tree_sitter_id,
327-
filename=str(file_path),
328-
)
329-
result = validate(proposal, settings)
451+
cached = cache.get(file_path)
452+
if cached is not None:
453+
result = cached
454+
else:
455+
code = self._read_file_text(file_path)
456+
if code is None:
457+
failed_files.append((file_path, "Unable to read file (binary?)"))
458+
continue
459+
460+
proposal = Proposal(
461+
code=code,
462+
language=lang_obj.tree_sitter_id,
463+
filename=str(file_path),
464+
)
465+
result = validate(proposal, settings)
466+
cache.set(file_path, result)
330467

331468
passed = self._handle_validation_result(
332469
result, file_path, lang_obj, output_format,

src/vallm/core/ast_compare.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,21 @@
33
from __future__ import annotations
44

55
import ast
6+
from functools import lru_cache
67
from typing import Optional
78

89
from tree_sitter_language_pack import get_parser
910

1011

12+
@lru_cache(maxsize=32)
def _cached_get_parser(language: str):
    """Return a cached tree-sitter parser for *language*.

    Parser construction is comparatively expensive; memoizing up to 32
    languages lets every file in a batch reuse one parser object instead
    of re-creating it per file.
    """
    return get_parser(language)
16+
17+
1118
def parse_code(code: str, language: str = "python"):
    """Parse *code* with a (cached) tree-sitter parser and return the tree."""
    return _cached_get_parser(language).parse(code.encode("utf-8"))
1522

1623

0 commit comments

Comments
 (0)