Skip to content

Commit 9b10537

Browse files
committed
Cache globbed files for speed increase
1 parent 8072181 commit 9b10537

File tree

3 files changed

+311
-2
lines changed

3 files changed

+311
-2
lines changed

src/yamlfix/entrypoints/cli.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,38 @@
1717
log = logging.getLogger(__name__)
1818

1919

20+
_GLOB_CACHE = {}
21+
22+
23+
def _clear_glob_cache() -> None:
24+
"""Clear the glob cache to prevent memory leaks in long-running processes."""
25+
_GLOB_CACHE.clear()
26+
27+
28+
def _glob_cache(dir_: Path, glob: str) -> set:
29+
cache_key = (str(dir_), glob, "g")
30+
if cache_key not in _GLOB_CACHE:
31+
_GLOB_CACHE[cache_key] = set(dir_.glob(glob))
32+
return _GLOB_CACHE[cache_key]
33+
34+
35+
def _rglob_cache(dir_: Path, glob: str) -> set:
36+
cache_key = (str(dir_), glob, "r")
37+
if cache_key not in _GLOB_CACHE:
38+
_GLOB_CACHE[cache_key] = set(dir_.rglob(glob))
39+
return _GLOB_CACHE[cache_key]
40+
41+
2042
def _matches_any_glob(
2143
file_to_test: Path, dir_: Path, globs: Optional[List[str]]
2244
) -> bool:
23-
return any(file_to_test in dir_.glob(glob) for glob in (globs or []))
45+
return any(file_to_test in _glob_cache(dir_, glob) for glob in (globs or []))
2446

2547

2648
def _find_all_yaml_files(
2749
dir_: Path, include_globs: Optional[List[str]], exclude_globs: Optional[List[str]]
2850
) -> List[Path]:
29-
files = [dir_.rglob(glob) for glob in (include_globs or [])]
51+
files = [list(_rglob_cache(dir_, glob)) for glob in (include_globs or [])]
3052
return [
3153
file
3254
for list_ in files
@@ -121,6 +143,9 @@ def cli( # pylint: disable=too-many-arguments
121143
if fixed_code is not None:
122144
print(fixed_code, end="")
123145

146+
# Clear cache to prevent memory leaks in long-running processes
147+
_clear_glob_cache()
148+
124149
if changed and check:
125150
sys.exit(1)
126151

tests/e2e/test_cli.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
import logging
44
import os
55
import re
6+
import time
67
from itertools import product
78
from pathlib import Path
89
from textwrap import dedent
10+
from unittest import mock
911

1012
import py # type: ignore
1113
import pytest
@@ -420,3 +422,93 @@ def test_do_not_read_folders_as_files(runner: CliRunner, tmpdir: py.path.local)
420422
result = runner.invoke(cli, [str(tmpdir)])
421423

422424
assert result.exit_code == 0
425+
426+
427+
def test_glob_cache_is_fast(runner: CliRunner, tmpdir: Path) -> None:
428+
"""Test that glob caching improves performance."""
429+
# Create 1000 files in the tmpdir
430+
for i in range(1000):
431+
(tmpdir / f"dont_use_me_{i}.yaml").write_text(
432+
"program: yamlfix", encoding="utf-8"
433+
)
434+
435+
for i in range(1000):
436+
(tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8")
437+
438+
# Clear any existing cache
439+
from yamlfix.entrypoints.cli import _clear_glob_cache
440+
441+
_clear_glob_cache()
442+
443+
# Test with cache (second run should benefit from cache)
444+
start_time = time.time()
445+
result = runner.invoke(
446+
cli,
447+
[str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"],
448+
)
449+
end_time = time.time()
450+
cache_time = end_time - start_time
451+
452+
assert result.exit_code == 0
453+
assert (
454+
cache_time <= 1
455+
), f"Expected cache time to be less than 1 second, got {cache_time:.3f}s"
456+
457+
458+
@pytest.mark.slow
459+
def test_glob_cache_excludes_slow(runner: CliRunner, tmpdir: Path) -> None:
460+
"""Test that glob caching improves performance with multiple exclude patterns."""
461+
# Create 1000 files in the tmpdir
462+
for i in range(1000):
463+
(tmpdir / f"dont_use_me_{i}.yaml").write_text(
464+
"program: yamlfix", encoding="utf-8"
465+
)
466+
467+
for i in range(1000):
468+
(tmpdir / f"use_me_{i}.yaml").write_text("program: yamlfix", encoding="utf-8")
469+
470+
# Test without cache (mock to bypass cache)
471+
start_time = time.time()
472+
with mock.patch(
473+
"yamlfix.entrypoints.cli._glob_cache",
474+
side_effect=lambda dir_, glob: set(dir_.glob(glob)),
475+
):
476+
with mock.patch(
477+
"yamlfix.entrypoints.cli._rglob_cache",
478+
side_effect=lambda dir_, glob: set(dir_.rglob(glob)),
479+
):
480+
result1 = runner.invoke(
481+
cli,
482+
[
483+
str(tmpdir),
484+
"--exclude",
485+
"dont_*.yaml",
486+
"--exclude",
487+
"dont_use_me_*.yaml",
488+
],
489+
)
490+
end_time = time.time()
491+
no_cache_time = end_time - start_time
492+
493+
# Clear any existing cache
494+
from yamlfix.entrypoints.cli import _clear_glob_cache
495+
496+
_clear_glob_cache()
497+
498+
# Test with cache (second run should benefit from cache)
499+
start_time = time.time()
500+
result2 = runner.invoke(
501+
cli,
502+
[str(tmpdir), "--exclude", "dont_*.yaml", "--exclude", "dont_use_me_*.yaml"],
503+
)
504+
end_time = time.time()
505+
cache_time = end_time - start_time
506+
507+
# Cache should be faster (less time)
508+
assert cache_time <= no_cache_time * 0.5, (
509+
f"Cached version should be faster or comparable to non-cached. "
510+
f"Cache time: {cache_time:.3f}s, No cache time: {no_cache_time:.3f}s"
511+
)
512+
513+
assert result1.exit_code == 0
514+
assert result2.exit_code == 0

tests/unit/test_cli_cache.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""Unit tests for CLI glob caching functions."""
2+
3+
from pathlib import Path
4+
5+
import pytest
6+
7+
from yamlfix.entrypoints.cli import (
8+
_GLOB_CACHE,
9+
_clear_glob_cache,
10+
_glob_cache,
11+
_rglob_cache,
12+
)
13+
14+
15+
@pytest.fixture(autouse=True)
16+
def clear_cache():
17+
"""Clear the glob cache before and after each test."""
18+
_clear_glob_cache()
19+
yield
20+
_clear_glob_cache()
21+
22+
23+
@pytest.fixture
24+
def test_directory(tmp_path: Path) -> Path:
25+
"""Create a test directory structure with YAML files."""
26+
# Create files in root
27+
(tmp_path / "test.yaml").write_text("test")
28+
(tmp_path / "test.yml").write_text("test")
29+
(tmp_path / "other.txt").write_text("test")
30+
31+
# Create subdirectory with nested files
32+
subdir = tmp_path / "subdir"
33+
subdir.mkdir()
34+
(subdir / "nested.yaml").write_text("test")
35+
(subdir / "nested.yml").write_text("test")
36+
37+
return tmp_path
38+
39+
40+
class TestGlobCache:
41+
"""Test the _glob_cache function."""
42+
43+
def test_glob_cache_basic_functionality(self, test_directory: Path):
44+
"""Test that _glob_cache returns correct files and caches results."""
45+
result = _glob_cache(test_directory, "*.yaml")
46+
47+
# Should find files in current directory only (not recursive)
48+
assert test_directory / "test.yaml" in result
49+
assert test_directory / "subdir" / "nested.yaml" not in result
50+
51+
# Cache should be populated
52+
assert len(_GLOB_CACHE) == 1
53+
cache_key = (str(test_directory), "*.yaml", "g")
54+
assert cache_key in _GLOB_CACHE
55+
56+
def test_glob_cache_uses_cache_on_repeat_calls(self, test_directory: Path):
57+
"""Test that subsequent calls use cached results."""
58+
# First call
59+
result1 = _glob_cache(test_directory, "*.yaml")
60+
cache_size_after_first = len(_GLOB_CACHE)
61+
62+
# Second call should use cache
63+
result2 = _glob_cache(test_directory, "*.yaml")
64+
cache_size_after_second = len(_GLOB_CACHE)
65+
66+
assert result1 == result2
67+
assert cache_size_after_first == cache_size_after_second == 1
68+
69+
def test_glob_cache_different_patterns_create_separate_entries(
70+
self, test_directory: Path
71+
):
72+
"""Test that different glob patterns create separate cache entries."""
73+
result_yaml = _glob_cache(test_directory, "*.yaml")
74+
result_yml = _glob_cache(test_directory, "*.yml")
75+
76+
assert len(_GLOB_CACHE) == 2
77+
assert result_yaml != result_yml
78+
assert test_directory / "test.yaml" in result_yaml
79+
assert test_directory / "test.yml" in result_yml
80+
81+
def test_glob_cache_different_directories_create_separate_entries(
82+
self, test_directory: Path
83+
):
84+
"""Test that different directories create separate cache entries."""
85+
# Create another directory
86+
other_dir = test_directory / "other"
87+
other_dir.mkdir()
88+
(other_dir / "other.yaml").write_text("test")
89+
90+
result1 = _glob_cache(test_directory, "*.yaml")
91+
result2 = _glob_cache(other_dir, "*.yaml")
92+
93+
assert len(_GLOB_CACHE) == 2
94+
assert result1 != result2
95+
assert test_directory / "test.yaml" in result1
96+
assert other_dir / "other.yaml" in result2
97+
98+
99+
class TestRglobCache:
100+
"""Test the _rglob_cache function."""
101+
102+
def test_rglob_cache_basic_functionality(self, test_directory: Path):
103+
"""Test that _rglob_cache returns correct files recursively and caches results."""
104+
result = _rglob_cache(test_directory, "*.yaml")
105+
106+
# Should find files recursively
107+
assert test_directory / "test.yaml" in result
108+
assert test_directory / "subdir" / "nested.yaml" in result
109+
110+
# Cache should be populated with rglob key
111+
assert len(_GLOB_CACHE) == 1
112+
cache_key = (str(test_directory), "*.yaml", "r")
113+
assert cache_key in _GLOB_CACHE
114+
115+
def test_rglob_cache_vs_glob_cache_different_keys(self, test_directory: Path):
116+
"""Test that rglob and glob use different cache keys."""
117+
glob_result = _glob_cache(test_directory, "*.yaml")
118+
rglob_result = _rglob_cache(test_directory, "*.yaml")
119+
120+
# Should have separate cache entries
121+
assert len(_GLOB_CACHE) == 2
122+
123+
# Results should be different (rglob includes nested files)
124+
assert len(rglob_result) > len(glob_result)
125+
assert test_directory / "subdir" / "nested.yaml" in rglob_result
126+
assert test_directory / "subdir" / "nested.yaml" not in glob_result
127+
128+
def test_rglob_cache_uses_cache_on_repeat_calls(self, test_directory: Path):
129+
"""Test that subsequent rglob calls use cached results."""
130+
result1 = _rglob_cache(test_directory, "*.yaml")
131+
cache_size_after_first = len(_GLOB_CACHE)
132+
133+
result2 = _rglob_cache(test_directory, "*.yaml")
134+
cache_size_after_second = len(_GLOB_CACHE)
135+
136+
assert result1 == result2
137+
assert cache_size_after_first == cache_size_after_second == 1
138+
139+
140+
class TestCacheClear:
141+
"""Test the _clear_glob_cache function."""
142+
143+
def test_clear_glob_cache_empties_cache(self, test_directory: Path):
144+
"""Test that cache clearing removes all entries."""
145+
# Populate cache with several entries
146+
_glob_cache(test_directory, "*.yaml")
147+
_glob_cache(test_directory, "*.yml")
148+
_rglob_cache(test_directory, "*.yaml")
149+
150+
assert len(_GLOB_CACHE) == 3
151+
152+
# Clear cache
153+
_clear_glob_cache()
154+
155+
assert len(_GLOB_CACHE) == 0
156+
157+
def test_clear_glob_cache_allows_fresh_caching(self, test_directory: Path):
158+
"""Test that after clearing, caching works normally again."""
159+
# First round of caching
160+
_glob_cache(test_directory, "*.yaml")
161+
assert len(_GLOB_CACHE) == 1
162+
163+
# Clear and verify empty
164+
_clear_glob_cache()
165+
assert len(_GLOB_CACHE) == 0
166+
167+
# Second round should work normally
168+
_glob_cache(test_directory, "*.yaml")
169+
assert len(_GLOB_CACHE) == 1
170+
171+
172+
class TestCacheKeyConstruction:
173+
"""Test that cache keys are constructed correctly."""
174+
175+
def test_cache_key_includes_directory_path(self, test_directory: Path):
176+
"""Test that cache keys include the directory path."""
177+
_glob_cache(test_directory, "*.yaml")
178+
179+
expected_key = (str(test_directory), "*.yaml", "g")
180+
assert expected_key in _GLOB_CACHE
181+
182+
def test_cache_key_distinguishes_glob_vs_rglob(self, test_directory: Path):
183+
"""Test that glob and rglob operations have different key suffixes."""
184+
_glob_cache(test_directory, "*.yaml")
185+
_rglob_cache(test_directory, "*.yaml")
186+
187+
glob_key = (str(test_directory), "*.yaml", "g")
188+
rglob_key = (str(test_directory), "*.yaml", "r")
189+
190+
assert glob_key in _GLOB_CACHE
191+
assert rglob_key in _GLOB_CACHE
192+
assert len(_GLOB_CACHE) == 2

0 commit comments

Comments
 (0)