Skip to content

Commit 1c9c123

Browse files
authored
Abstract filesystem operations in file_utils.py (#179)
* Abstract filesystem operations in file_utils.py - Add fs parameter to walk_with_gitignore and get_files functions - Replace direct filesystem calls with Fs interface methods - Use fs.stat(), fs.readlines(), and fs.list() instead of os.scandir() and open() - Exclude .gitignore files from walk results - Add comprehensive tests using RecordFs for deterministic testing - Test gitignore filtering, skip functions, and edge cases * Address PR feedback: use fully qualified imports, absolute paths, and add nested gitignore test - Use fully qualified imports in file_utils.py and file_utils_test.py - Convert all test paths to absolute paths as requested - Add comprehensive nested gitignore test case - Improve RecordFs.list() method to handle intermediate directories properly
1 parent f3aa1d1 commit 1c9c123

File tree

3 files changed

+222
-13
lines changed

3 files changed

+222
-13
lines changed

aws_doc_sdk_examples_tools/file_utils.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from shutil import rmtree
99

1010
from pathspec import GitIgnoreSpec
11+
from aws_doc_sdk_examples_tools.fs import Fs, PathFs
1112

1213

1314
def match_path_to_specs(path: Path, specs: List[GitIgnoreSpec]) -> bool:
@@ -21,7 +22,7 @@ def match_path_to_specs(path: Path, specs: List[GitIgnoreSpec]) -> bool:
2122

2223

2324
def walk_with_gitignore(
24-
root: Path, specs: List[GitIgnoreSpec] = []
25+
root: Path, specs: List[GitIgnoreSpec] = [], fs: Fs = PathFs()
2526
) -> Generator[Path, None, None]:
2627
"""
2728
Starting from a root directory, walk the file system yielding a path for each file.
@@ -30,27 +31,31 @@ def walk_with_gitignore(
3031
fiddling with a number of flags.
3132
"""
3233
gitignore = root / ".gitignore"
33-
if gitignore.exists():
34-
with open(root / ".gitignore", "r", encoding="utf-8") as ignore_file:
35-
specs = [*specs, GitIgnoreSpec.from_lines(ignore_file.readlines())]
36-
for entry in os.scandir(root):
37-
path = Path(entry.path)
34+
gitignore_stat = fs.stat(gitignore)
35+
if gitignore_stat.exists:
36+
lines = fs.readlines(gitignore)
37+
specs = [*specs, GitIgnoreSpec.from_lines(lines)]
38+
39+
for path in fs.list(root):
3840
if not match_path_to_specs(path, specs):
39-
if entry.is_dir():
40-
yield from walk_with_gitignore(path, specs)
41+
path_stat = fs.stat(path)
42+
if path_stat.is_dir:
43+
yield from walk_with_gitignore(path, specs, fs)
4144
else:
42-
yield path
45+
# Don't yield .gitignore files themselves
46+
if path.name != ".gitignore":
47+
yield path
4348

4449

4550
def get_files(
46-
root: Path, skip: Callable[[Path], bool] = lambda _: False
51+
root: Path, skip: Callable[[Path], bool] = lambda _: False, fs: Fs = PathFs()
4752
) -> Generator[Path, None, None]:
4853
"""
4954
Yield non-skipped files, that is, anything not matching git ls-files and not
5055
in the "to skip" files that are in git but are machine generated, so we don't
5156
want to validate them.
5257
"""
53-
for path in walk_with_gitignore(root):
58+
for path in walk_with_gitignore(root, fs=fs):
5459
if not skip(path):
5560
yield path
5661

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Tests for file_utils.py with filesystem abstraction.
6+
"""
7+
8+
from pathlib import Path
9+
10+
from aws_doc_sdk_examples_tools.fs import RecordFs
11+
from aws_doc_sdk_examples_tools.file_utils import walk_with_gitignore, get_files
12+
13+
14+
class TestWalkWithGitignore:
15+
"""Test walk_with_gitignore with RecordFs."""
16+
17+
def test_basic_directory_traversal(self):
18+
"""Test basic directory traversal without gitignore."""
19+
fs = RecordFs(
20+
{
21+
Path("/root/file1.py"): "print('file1')",
22+
Path("/root/file2.js"): "console.log('file2')",
23+
}
24+
)
25+
26+
files = list(walk_with_gitignore(Path("/root"), fs=fs))
27+
28+
expected = [
29+
Path("/root/file1.py"),
30+
Path("/root/file2.js"),
31+
]
32+
assert sorted(files) == sorted(expected)
33+
34+
def test_gitignore_filtering(self):
35+
"""Test that gitignore rules are applied correctly."""
36+
fs = RecordFs(
37+
{
38+
Path("/root/.gitignore"): "*.tmp\n*.log\n",
39+
Path("/root/keep.py"): "print('keep')",
40+
Path("/root/ignore.tmp"): "temporary",
41+
Path("/root/keep.js"): "console.log('keep')",
42+
Path("/root/debug.log"): "log content",
43+
}
44+
)
45+
46+
files = list(walk_with_gitignore(Path("/root"), fs=fs))
47+
48+
# .gitignore files should not be included in results
49+
expected = [
50+
Path("/root/keep.py"),
51+
Path("/root/keep.js"),
52+
]
53+
assert sorted(files) == sorted(expected)
54+
55+
def test_no_gitignore_file(self):
56+
"""Test directory traversal when no .gitignore exists."""
57+
fs = RecordFs(
58+
{
59+
Path("/root/file1.py"): "print('file1')",
60+
Path("/root/file2.js"): "console.log('file2')",
61+
Path("/root/file3.txt"): "text content",
62+
}
63+
)
64+
65+
files = list(walk_with_gitignore(Path("/root"), fs=fs))
66+
67+
expected = [
68+
Path("/root/file1.py"),
69+
Path("/root/file2.js"),
70+
Path("/root/file3.txt"),
71+
]
72+
assert sorted(files) == sorted(expected)
73+
74+
def test_empty_directory(self):
75+
"""Test walking an empty directory."""
76+
fs = RecordFs({})
77+
78+
files = list(walk_with_gitignore(Path("/empty"), fs=fs))
79+
80+
assert files == []
81+
82+
def test_directory_with_only_gitignore(self):
83+
"""Test directory that only contains .gitignore file."""
84+
fs = RecordFs(
85+
{
86+
Path("/root/.gitignore"): "*.tmp\n",
87+
}
88+
)
89+
90+
files = list(walk_with_gitignore(Path("/root"), fs=fs))
91+
92+
assert files == []
93+
94+
def test_nested_gitignores(self):
95+
"""Test nested gitignore files with different rules."""
96+
fs = RecordFs(
97+
{
98+
# Root level gitignore ignores *.log files
99+
Path("/root/.gitignore"): "*.log\n",
100+
Path("/root/keep.py"): "print('keep')",
101+
Path("/root/debug.log"): "root log", # Should be ignored
102+
# Nested directory with its own gitignore ignoring *.tmp files
103+
Path("/root/subdir/.gitignore"): "*.tmp\n",
104+
Path("/root/subdir/keep.js"): "console.log('keep')",
105+
Path(
106+
"/root/subdir/ignore.tmp"
107+
): "temporary", # Should be ignored by subdir gitignore
108+
Path(
109+
"/root/subdir/keep.log"
110+
): "nested log", # Should be ignored by root gitignore
111+
}
112+
)
113+
114+
files = list(walk_with_gitignore(Path("/root"), fs=fs))
115+
116+
# Only files that don't match any gitignore pattern should be returned
117+
expected = [
118+
Path("/root/keep.py"),
119+
Path("/root/subdir/keep.js"),
120+
]
121+
assert sorted(files) == sorted(expected)
122+
123+
124+
class TestGetFiles:
125+
"""Test get_files with RecordFs."""
126+
127+
def test_get_files_basic(self):
128+
"""Test basic get_files functionality."""
129+
fs = RecordFs(
130+
{
131+
Path("/root/file1.py"): "print('file1')",
132+
Path("/root/file2.js"): "console.log('file2')",
133+
}
134+
)
135+
136+
files = list(get_files(Path("/root"), fs=fs))
137+
138+
expected = [
139+
Path("/root/file1.py"),
140+
Path("/root/file2.js"),
141+
]
142+
assert sorted(files) == sorted(expected)
143+
144+
def test_get_files_with_skip_function(self):
145+
"""Test get_files with skip function."""
146+
fs = RecordFs(
147+
{
148+
Path("/root/keep.py"): "print('keep')",
149+
Path("/root/skip.py"): "print('skip')",
150+
Path("/root/keep.js"): "console.log('keep')",
151+
Path("/root/skip.js"): "console.log('skip')",
152+
}
153+
)
154+
155+
def skip_function(path: Path) -> bool:
156+
return "skip" in path.name
157+
158+
files = list(get_files(Path("/root"), skip=skip_function, fs=fs))
159+
160+
expected = [
161+
Path("/root/keep.py"),
162+
Path("/root/keep.js"),
163+
]
164+
assert sorted(files) == sorted(expected)
165+
166+
def test_get_files_with_gitignore_and_skip(self):
167+
"""Test get_files with both gitignore and skip function."""
168+
fs = RecordFs(
169+
{
170+
Path("/root/.gitignore"): "*.tmp\n",
171+
Path("/root/keep.py"): "print('keep')",
172+
Path("/root/skip.py"): "print('skip')",
173+
Path("/root/ignore.tmp"): "temporary",
174+
Path("/root/keep.js"): "console.log('keep')",
175+
}
176+
)
177+
178+
def skip_function(path: Path) -> bool:
179+
return "skip" in path.name
180+
181+
files = list(get_files(Path("/root"), skip=skip_function, fs=fs))
182+
183+
expected = [
184+
Path("/root/keep.py"),
185+
Path("/root/keep.js"),
186+
]
187+
assert sorted(files) == sorted(expected)

aws_doc_sdk_examples_tools/fs.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def glob(self, path: Path, glob: str) -> Generator[Path, None, None]:
5353
return path.glob(glob)
5454

5555
def read(self, path: Path) -> str:
56-
with path.open("r") as file:
56+
with path.open("r", encoding="utf-8") as file:
5757
return file.read()
5858

5959
def readlines(self, path: Path) -> List[str]:
@@ -118,7 +118,24 @@ def mkdir(self, path: Path):
118118
self.fs.setdefault(path, "")
119119

120120
def list(self, path: Path) -> List[Path]:
121-
return [item for item in self.fs.keys() if item.parent == path]
121+
# If it's a file, return an empty list
122+
if self.stat(path).is_file:
123+
return []
124+
125+
# Gather all entries that are immediate children of `path`
126+
prefix = str(path).rstrip("/") + "/"
127+
children = set()
128+
129+
for item in self.fs.keys():
130+
item_s = str(item)
131+
if item_s.startswith(prefix):
132+
# Determine the remainder path after the prefix
133+
remainder = item_s[len(prefix) :]
134+
# Split off the first component
135+
first_part = remainder.split("/", 1)[0]
136+
children.add(Path(prefix + first_part))
137+
138+
return sorted(children)
122139

123140

124141
fs = PathFs()

0 commit comments

Comments
 (0)