Skip to content

Commit 3897cfd

Browse files
authored
Unify behavior of source code search (#332)
* Add tests for walk_source_dir * Don't test path order: it will differ between Windows and Posix * Restore 3.9 compatible code. Add sort option. Update tests * Create source_code_search function * Use os functions only when necessary. Update comments.
1 parent 5d15bf6 commit 3897cfd

File tree

4 files changed

+281
-20
lines changed

4 files changed

+281
-20
lines changed

reccmp/compare/core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing_extensions import Self
66
from reccmp.project.detect import RecCmpTarget
77
from reccmp.difflib import get_grouped_opcodes
8-
from reccmp.dir import walk_source_dir
8+
from reccmp.dir import source_code_search
99
from reccmp.compare.functions import FunctionComparator
1010
from reccmp.formats import (
1111
Image,
@@ -195,7 +195,7 @@ def from_target(cls, target: RecCmpTarget) -> Self:
195195
)
196196
pdb_file = CvdumpAnalysis(cvdump)
197197

198-
code_paths = walk_source_dir(target.source_root)
198+
code_paths = source_code_search(target.source_root)
199199
code_files = list(TextFile.from_files(code_paths, allow_error=True))
200200

201201
data_sources = list(TextFile.from_files(target.data_sources, allow_error=True))

reccmp/dir.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import os
44
import subprocess
5-
from typing import Iterator
5+
from typing import Iterable, Iterator
66
from pathlib import Path, PurePath, PureWindowsPath
77

88

@@ -134,8 +134,9 @@ def resolve_cvdump(self, path_str: str) -> str:
134134
return self._memo[path_str]
135135

136136

137-
def is_file_c_like(filename: Path | str) -> bool:
138-
return Path(filename).suffix.lower() in (
137+
def is_file_c_like(path: Path) -> bool:
138+
"""Tests the given path for typical C/C++ file extensions."""
139+
return path.suffix.lower() in (
139140
".c",
140141
".h",
141142
".cc",
@@ -144,18 +145,52 @@ def is_file_c_like(filename: Path | str) -> bool:
144145
".hxx",
145146
".cpp",
146147
".hpp",
147-
".C",
148148
)
149149

150150

151-
def walk_source_dir(source: Path, recursive: bool = True) -> Iterator[Path]:
152-
"""Generator to walk the given directory recursively and return
153-
any C++ files found."""
151+
def platform_independent_path_sort(paths: Iterable[Path]) -> Iterator[Path]:
152+
"""PosixPaths are case-sensitive. WindowsPaths are not.
153+
To ensure that reccmp processing is consistent across platforms,
154+
use this method to sort paths in a case-insensitive order.
155+
An example of the problem we want to fix: metadata for the same entity address
156+
in multiple files may cause us to overwrite data. If we do, it should be done
157+
in a consistent way because we have processed the metadata files in the same order.
158+
"""
159+
yield from sorted(paths, key=lambda p: str(p).lower())
160+
161+
162+
def walk_source_dir(source: Path, *, recursive: bool = True) -> Iterator[Path]:
163+
"""Returns any C/C++ source code files found in the given directory tree."""
154164

155-
for subdir, _, files in source.walk():
165+
# Python 3.12 introduced Path.walk(). We use os.walk() instead for broader compatibility.
166+
for subdir, _, files in os.walk(source.absolute()):
156167
for file in files:
157-
if is_file_c_like(file):
158-
yield subdir / file
168+
path = Path(os.path.join(subdir, file))
169+
if is_file_c_like(path):
170+
yield path
159171

160172
if not recursive:
161173
break
174+
175+
176+
def source_code_search(search_paths: Path | Iterable[Path]) -> Iterator[Path]:
177+
"""Use the provided search paths to find source code files in the project
178+
that we will scan for reccmp metadata. Returns a list of distinct paths
179+
sorted by their lower-case string representation."""
180+
code_files = set()
181+
182+
# Allow single Path argument
183+
if not isinstance(search_paths, Iterable):
184+
search_paths = (search_paths,)
185+
186+
for path in search_paths:
187+
if not path.exists():
188+
continue
189+
190+
if path.is_file() and is_file_c_like(path):
191+
code_files.add(path)
192+
continue
193+
194+
code_files.update(walk_source_dir(path))
195+
196+
yield from platform_independent_path_sort(code_files)

reccmp/tools/decomplint.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from typing import Iterable
77
import colorama
88
import reccmp
9-
from reccmp.dir import walk_source_dir
9+
from reccmp.dir import source_code_search
1010
from reccmp.parser import DecompLinter
1111
from reccmp.parser.error import ParserAlert
1212
from reccmp.project.logging import argparse_add_logging_args, argparse_parse_logging
@@ -108,7 +108,7 @@ def process_files(
108108

109109
def main():
110110
args = parse_args()
111-
files_to_check: list[Path] = []
111+
search_paths: list[Path] = []
112112

113113
if not args.paths:
114114
# No path specified. Try to find the project file.
@@ -121,18 +121,16 @@ def main():
121121
# then get all filenames from each code directory.
122122
for target in project.targets.values():
123123
if target.source_root:
124-
files_to_check.extend(walk_source_dir(target.source_root))
124+
search_paths.append(target.source_root)
125125
else:
126126
for path in args.paths:
127-
if path.is_dir():
128-
files_to_check.extend(walk_source_dir(path))
129-
elif path.is_file():
130-
files_to_check.append(path)
127+
if path.is_dir() or path.is_file():
128+
search_paths.append(path)
131129
else:
132130
logger.error("Invalid path: %s", path)
133131

134132
# Use set() so we check each file only once.
135-
reduced_file_list = sorted(set(files_to_check))
133+
reduced_file_list = source_code_search(search_paths)
136134

137135
codefiles = list(TextFile.from_files(reduced_file_list, allow_error=True))
138136

tests/test_dir.py

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
"""Tests for walking source code directories in search of relevant files"""
2+
3+
from pathlib import Path
4+
from reccmp.dir import source_code_search, walk_source_dir
5+
6+
7+
def create_blank_file(p: Path):
8+
"""Helper to make this action more idiomatic in the tests."""
9+
p.write_text("")
10+
11+
12+
def test_walk_empty_dir(tmp_path_factory):
13+
"""Empty directory returns no files."""
14+
path = tmp_path_factory.mktemp("empty")
15+
16+
files = list(walk_source_dir(path))
17+
assert not files
18+
19+
20+
def test_walk_dir_with_irrelevant_files(tmp_path_factory):
21+
"""Skip files that do not match our default filter on file extension."""
22+
path = tmp_path_factory.mktemp("no_match")
23+
create_blank_file(path / "test.txt")
24+
create_blank_file(path / "sample.jpg")
25+
26+
files = list(walk_source_dir(path))
27+
assert not files
28+
29+
30+
def test_walk_dir_with_mixed_files(tmp_path_factory):
31+
"""Return code files and skip non-matching files."""
32+
path = tmp_path_factory.mktemp("mixed")
33+
create_blank_file(path / "test.txt")
34+
create_blank_file(path / "sample.jpg")
35+
create_blank_file(path / "game.cpp")
36+
create_blank_file(path / "game.hpp")
37+
38+
files = list(walk_source_dir(path))
39+
assert len(files) == 2
40+
assert {f.name for f in files} == {"game.cpp", "game.hpp"}
41+
42+
43+
def test_walk_non_directory(tmp_path_factory):
44+
"""Return nothing if the input path is not a directory."""
45+
path = tmp_path_factory.mktemp("mixed")
46+
create_blank_file(path / "test.txt")
47+
create_blank_file(path / "sample.jpg")
48+
create_blank_file(path / "game.cpp")
49+
create_blank_file(path / "game.hpp")
50+
51+
files = list(walk_source_dir(path / "game.cpp"))
52+
assert not files
53+
54+
55+
def test_walk_recursion(tmp_path_factory):
56+
"""Should scan every subdirectory for matching files."""
57+
path = tmp_path_factory.mktemp("recurse")
58+
create_blank_file(path / "game.cpp")
59+
(path / "x" / "y" / "z").mkdir(parents=True)
60+
create_blank_file(path / "x" / "hello.cpp")
61+
create_blank_file(path / "x" / "y" / "z" / "test.cpp")
62+
63+
files = list(walk_source_dir(path))
64+
assert {f.name for f in files} == {"game.cpp", "hello.cpp", "test.cpp"}
65+
66+
67+
def test_walk_recursion_disabled(tmp_path_factory):
68+
"""Should not enter subdirectories if recursion is disabled."""
69+
path = tmp_path_factory.mktemp("recurse")
70+
create_blank_file(path / "game.cpp")
71+
(path / "x" / "y" / "z").mkdir(parents=True)
72+
create_blank_file(path / "x" / "hello.cpp")
73+
create_blank_file(path / "x" / "y" / "z" / "test.cpp")
74+
75+
files = list(walk_source_dir(path, recursive=False))
76+
assert {f.name for f in files} == {"game.cpp"}
77+
78+
79+
def test_walk_absolute_paths(tmp_path_factory):
80+
"""Returned paths should be absolute and include all subdirectories."""
81+
path = tmp_path_factory.mktemp("recurse")
82+
create_blank_file(path / "game.cpp")
83+
(path / "x" / "y" / "z").mkdir(parents=True)
84+
create_blank_file(path / "x" / "hello.cpp")
85+
create_blank_file(path / "x" / "y" / "z" / "test.cpp")
86+
87+
files = list(walk_source_dir(path))
88+
for f in files:
89+
assert f.is_absolute()
90+
91+
# For files in subdirectories, make sure each one is represented.
92+
# Check only the components relative to the base path.
93+
# We don't control where pytest creates the tmp directory.
94+
if f.name == "hello.cpp":
95+
assert "x" in f.relative_to(path).parts
96+
97+
if f.name == "test.cpp":
98+
assert "x" in f.relative_to(path).parts
99+
assert "y" in f.relative_to(path).parts
100+
assert "z" in f.relative_to(path).parts
101+
102+
103+
def test_walk_mixed_case_extension(tmp_path_factory):
104+
"""Should use case-insensitive match for file extensions."""
105+
path = tmp_path_factory.mktemp("mixed_case")
106+
create_blank_file(path / "game.CPP")
107+
create_blank_file(path / "HELLO.C")
108+
create_blank_file(path / "HELLO.h")
109+
create_blank_file(path / "test.hpP")
110+
111+
files = list(walk_source_dir(path))
112+
assert {f.name.lower() for f in files} == {
113+
"game.cpp",
114+
"hello.c",
115+
"hello.h",
116+
"test.hpp",
117+
}
118+
119+
120+
def test_search_mixed_files(tmp_path_factory):
121+
"""Search behaves the same as walk_source_dir: return only matching code files."""
122+
path = tmp_path_factory.mktemp("mixed")
123+
create_blank_file(path / "test.txt")
124+
create_blank_file(path / "sample.jpg")
125+
create_blank_file(path / "game.cpp")
126+
create_blank_file(path / "game.hpp")
127+
128+
files = list(source_code_search(path))
129+
assert {f.name for f in files} == {"game.cpp", "game.hpp"}
130+
131+
132+
def test_search_ignore_missing_dir(tmp_path_factory):
133+
"""Search should return nothing if the search path does not exist."""
134+
path = tmp_path_factory.mktemp("mixed")
135+
create_blank_file(path / "test.txt")
136+
create_blank_file(path / "sample.jpg")
137+
create_blank_file(path / "game.cpp")
138+
create_blank_file(path / "game.hpp")
139+
140+
files = list(source_code_search(path / "x"))
141+
assert not files
142+
143+
144+
def test_search_input_order_ignored(tmp_path_factory):
145+
"""The returned paths are always sorted by their lower-case string version,
146+
regardless of input order."""
147+
path = tmp_path_factory.mktemp("order")
148+
create_blank_file(path / "game.cpp")
149+
create_blank_file(path / "Test.cpp")
150+
151+
search_paths = (path / "Test.cpp", path / "game.cpp")
152+
files = list(source_code_search(search_paths))
153+
assert [f.name for f in files] == ["game.cpp", "Test.cpp"]
154+
155+
156+
def test_search_ignore_irrelevant_file_even_if_targeted(tmp_path_factory):
157+
"""The current behavior is to test each file's extension.
158+
This is true even if the input search path points directly at a file."""
159+
path = tmp_path_factory.mktemp("mixed")
160+
create_blank_file(path / "test.txt")
161+
create_blank_file(path / "sample.jpg")
162+
create_blank_file(path / "game.cpp")
163+
create_blank_file(path / "game.hpp")
164+
165+
search_paths = path / "test.text"
166+
files = list(source_code_search(search_paths))
167+
assert not files
168+
169+
170+
def test_search_nested_dirs(tmp_path_factory):
171+
"""Should return distinct files if any search paths overlap."""
172+
path = tmp_path_factory.mktemp("recurse")
173+
create_blank_file(path / "game.cpp")
174+
(path / "x" / "y" / "z").mkdir(parents=True)
175+
create_blank_file(path / "x" / "hello.cpp")
176+
create_blank_file(path / "x" / "y" / "z" / "test.cpp")
177+
178+
# Nested paths
179+
search_paths = (path, path / "x")
180+
files = list(source_code_search(search_paths))
181+
assert len(files) == 3
182+
183+
184+
def test_search_using_file(tmp_path_factory):
185+
"""Using a file as our search path, the search function
186+
should return the path if the file exists and matches our criteria."""
187+
path = tmp_path_factory.mktemp("mixed")
188+
create_blank_file(path / "test.txt")
189+
create_blank_file(path / "sample.jpg")
190+
create_blank_file(path / "game.cpp")
191+
create_blank_file(path / "game.hpp")
192+
193+
files = list(source_code_search(path / "game.cpp"))
194+
assert [f.name for f in files] == ["game.cpp"]
195+
196+
197+
def test_search_using_file_not_exist(tmp_path_factory):
198+
"""Search should return nothing if the search path does not exist.
199+
Using a file in this example."""
200+
path = tmp_path_factory.mktemp("mixed")
201+
create_blank_file(path / "test.txt")
202+
create_blank_file(path / "sample.jpg")
203+
create_blank_file(path / "game.cpp")
204+
create_blank_file(path / "game.hpp")
205+
206+
files = list(source_code_search(path / "hello.cpp"))
207+
assert not files
208+
209+
210+
def test_search_case_insensitive_order(tmp_path_factory):
211+
"""Make sure path order is consistent across Posix/Windows runners.
212+
The difference is that paths are case-sensitive on Posix but not Windows.
213+
The expectation is that reccmp returns the same results regardless of platform.
214+
Metadata read from source code files could be overwritten if it is duplicated across files.
215+
"""
216+
path = tmp_path_factory.mktemp("mixed_case")
217+
create_blank_file(path / "game.CPP")
218+
create_blank_file(path / "HELLO.C")
219+
create_blank_file(path / "HELLO.h")
220+
create_blank_file(path / "test.hpP")
221+
222+
files = list(source_code_search(path))
223+
assert [f.name.lower() for f in files] == [
224+
"game.cpp",
225+
"hello.c",
226+
"hello.h",
227+
"test.hpp",
228+
]

0 commit comments

Comments
 (0)