Skip to content

Commit cdbcc95

Browse files
authored
Merge pull request #15 from trailofbits/subdirectories
Adds support for specifying subdirectories for comparison
2 parents (c38d35d + 2de95b9); commit cdbcc95

File tree

6 files changed, with 195 additions and 61 deletions

README.md

Lines changed: 70 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,10 @@ Vendetect helps identify copied or vendored code between repositories, making it
1414

1515
Key features:
1616
- Compare code between two repositories (local or remote)
17+
- Analyze specific subdirectories within repositories
1718
- Identify files with similar code and display them side-by-side
1819
- Show similarity percentages for matched code
20+
- Filter by file types and adjust similarity thresholds
1921
- Support for different programming languages through Pygments lexers
2022
- Similarity is _not_ solely based upon symbol names; vendetect also considers semantics
2123

@@ -66,23 +68,84 @@ Where:
6668
- `TEST_REPO`: Path or URL to the repository you want to check for copied code
6769
- `SOURCE_REPO`: Path or URL to the repository that is the potential source of the code
6870

69-
### Example
71+
### Examples
7072

7173
```bash
7274
# Compare two local repositories
7375
vendetect /path/to/my/project /path/to/another/project
7476

7577
# Compare a local project with a remote repository
7678
vendetect /path/to/my/project https://github.com/example/repo.git
79+
80+
# Compare only specific subdirectories within repositories
81+
vendetect /path/to/my/project https://github.com/example/repo.git \
82+
--test-subdir src/components \
83+
--source-subdir lib/ui
84+
85+
# Filter by file types and adjust similarity threshold
86+
vendetect /path/to/my/project /path/to/another/project \
87+
--type py --type js \
88+
--min-similarity 0.8
7789
```
7890

7991
### Options
8092

8193
```
82-
--format FORMAT Output format: rich, csv, or json (default=rich)
83-
--log-level LEVEL Sets the log level (default=INFO)
84-
--debug Equivalent to --log-level=DEBUG
85-
--quiet Equivalent to --log-level=CRITICAL
94+
--format FORMAT Output format: rich, csv, or json (default=rich)
95+
--output OUTPUT Output file path (default: stdout)
96+
--force Force overwrite of existing output file
97+
--type FILE_TYPES, -t File extension to consider (can be used multiple times)
98+
--min-similarity THRESHOLD Minimum similarity threshold (range: 0.0-1.0, default: 0.5)
99+
--test-subdir DIR, -ts Subdirectory within TEST_REPO to analyze
100+
--source-subdir DIR, -ss Subdirectory within SOURCE_REPO to analyze
101+
--incremental Enable incremental result reporting
102+
--batch-size SIZE Number of files to process per batch (default: 100)
103+
--max-history-depth DEPTH Maximum commit history depth (default: -1 = entire history)
104+
--log-level LEVEL Sets the log level (default=INFO)
105+
--debug Equivalent to --log-level=DEBUG
106+
--quiet Equivalent to --log-level=CRITICAL
107+
```
108+
109+
### Advanced Features
110+
111+
#### Subdirectory Analysis
112+
When working with large repositories, you can focus analysis on specific subdirectories:
113+
114+
```bash
115+
# Analyze only the src/ directory in both repositories
116+
vendetect /path/to/my/project /path/to/another/project \
117+
--test-subdir src --source-subdir src
118+
119+
# Compare frontend code in one repo with backend in another
120+
vendetect /path/to/frontend-repo /path/to/backend-repo \
121+
--test-subdir client/src --source-subdir server/utils
122+
```
123+
124+
This is particularly useful for:
125+
- Focusing on relevant code sections
126+
- Reducing analysis time for large repositories
127+
- Comparing similar modules across different project structures
128+
129+
#### File Type Filtering
130+
Control which files are analyzed by specifying file extensions:
131+
132+
```bash
133+
# Only analyze Python files
134+
vendetect /path/to/my/project /path/to/another/project --type py
135+
136+
# Analyze multiple file types
137+
vendetect /path/to/my/project /path/to/another/project --type py --type js --type ts
138+
```
139+
140+
#### Similarity Thresholds
141+
Adjust the minimum similarity threshold to filter results:
142+
143+
```bash
144+
# Show only high-confidence matches (80% similarity or higher)
145+
vendetect /path/to/my/project /path/to/another/project --min-similarity 0.8
146+
147+
# Show all potential matches (lower threshold)
148+
vendetect /path/to/my/project /path/to/another/project --min-similarity 0.3
86149
```
87150

88151
### Output Formats
@@ -95,12 +158,12 @@ Vendetect supports three output formats:
95158

96159
Example using CSV output:
97160
```bash
98-
vendetect /path/to/my/project /path/to/another/project --format csv > results.csv
161+
vendetect /path/to/my/project /path/to/another/project --format csv --output results.csv
99162
```
100163

101164
Example using JSON output:
102165
```bash
103-
vendetect /path/to/my/project /path/to/another/project --format json > results.json
166+
vendetect /path/to/my/project /path/to/another/project --format json --output results.json
104167
```
105168

106169
## How it works 🧐

src/vendetect/_cli.py

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -249,8 +249,24 @@ def read_file_content(file: File) -> str:
249249
def main() -> None: # noqa: C901, PLR0912, PLR0915
250250
parser = argparse.ArgumentParser(prog="vendetect")
251251

252-
parser.add_argument("TEST_REPO", type=str, help="path to the test repository")
253-
parser.add_argument("SOURCE_REPO", type=str, help="path to the source repository")
252+
parser.add_argument("TEST_REPO", type=str, help="path or URL to the test repository")
253+
parser.add_argument("SOURCE_REPO", type=str, help="path or URL to the source repository")
254+
parser.add_argument(
255+
"--test-subdir",
256+
"-ts",
257+
type=str,
258+
default=None,
259+
help="relative path to the subdirectory in TEST_REPO in which to test (optional, for use when TEST_REPO is a "
260+
"remote git repository URL)",
261+
)
262+
parser.add_argument(
263+
"--source-subdir",
264+
"-ss",
265+
type=str,
266+
default=None,
267+
help="relative path to the subdirectory in SOURCE_REPO in which to test (optional, for use when SOURCE_REPO is "
268+
"a remote git repository URL)",
269+
)
254270
parser.add_argument(
255271
"--format",
256272
type=str,
@@ -348,8 +364,8 @@ def main() -> None: # noqa: C901, PLR0912, PLR0915
348364

349365
try:
350366
with (
351-
Repository.load(args.TEST_REPO) as test_repo,
352-
Repository.load(args.SOURCE_REPO) as source_repo,
367+
Repository.load(args.TEST_REPO, args.test_subdir) as test_repo,
368+
Repository.load(args.SOURCE_REPO, args.source_subdir) as source_repo,
353369
RichStatus(console) as status,
354370
):
355371
# Initialize detector with optimization options

src/vendetect/comparison.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -63,10 +63,14 @@ class Comparison:
6363
slices2: tuple[Slice, ...]
6464

6565
def __lt__(self, other: "Comparison") -> bool:
66-
if self.token_overlap > other.token_overlap:
67-
return True
68-
if self.token_overlap < other.token_overlap:
69-
return False
66+
# TODO(evan.sultanik@trailofbits.com): Make the comparison metric user-specifiable (#14) # noqa: FIX002
67+
# For now, disable comparison of token overlap, because that was causing too many false-positives
68+
# (I believe due to whitespace overlap)
69+
#
70+
# if self.token_overlap > other.token_overlap:
71+
# return True # noqa: ERA001
72+
# if self.token_overlap < other.token_overlap:
73+
# return False # noqa: ERA001
7074
oursim = self.similarity1 + self.similarity2
7175
theirsim = other.similarity1 + other.similarity2
7276
return oursim > theirsim

src/vendetect/detector.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ def update_compare_progress(self, file: File | None = None) -> None:
4444

4545

4646
@dataclass(frozen=True, unsafe_hash=True)
47-
class Source:
47+
class Source: # noqa: PLW1641 to fix a false-positive from ruff
4848
file: File
4949
source_slices: tuple[Slice, ...]
5050

src/vendetect/repo.py

Lines changed: 72 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -33,25 +33,35 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
3333

3434

3535
class Repository:
36-
def __init__(self, root_path: Path, rev: str | None = None):
36+
def __init__(self, root_path: Path, rev: str | None = None, subdir: Path | None = None):
3737
self.root_path: Path = root_path
38+
if subdir is not None and subdir.is_absolute():
39+
subdir = subdir.relative_to(root_path)
40+
self.subdir: Path | None = subdir
3841
self.rev: str = ""
3942
if rev is None:
4043
with self:
4144
if self.is_git:
4245
git_path: str = GIT_PATH # type: ignore
4346
self.rev = (
44-
subprocess.check_output([git_path, "rev-parse", "HEAD"], cwd=self.root_path) # noqa: S603
47+
subprocess.check_output([git_path, "rev-parse", "HEAD"], cwd=self.path) # noqa: S603
4548
.strip()
4649
.decode("utf-8")
4750
)
4851
else:
4952
self.rev = rev
5053

54+
@property
55+
@with_self
56+
def path(self) -> Path:
57+
if self.subdir is None:
58+
return self.root_path
59+
return self.root_path / self.subdir
60+
5161
def __hash__(self) -> int:
5262
if self.rev:
5363
return hash(self.rev)
54-
return hash(self.root_path)
64+
return hash(self.path)
5565

5666
def __eq__(self, other: object) -> bool:
5767
if not isinstance(other, Repository):
@@ -60,7 +70,7 @@ def __eq__(self, other: object) -> bool:
6070
return self.rev == other.rev
6171
if self.rev or other.rev:
6272
return False
63-
return self.root_path == other.root_path
73+
return self.path == other.path
6474

6575
def __enter__(self) -> Self:
6676
return self
@@ -87,7 +97,7 @@ def previous_version(self, path: Path) -> Optional["RepositoryCommit"]:
8797
)
8898
raise RepositoryError(msg)
8999
if path.is_absolute():
90-
path = path.relative_to(self.root_path)
100+
path = path.relative_to(self.path)
91101
prev_version = (
92102
subprocess.check_output( # noqa: S603
93103
[
@@ -102,7 +112,7 @@ def previous_version(self, path: Path) -> Optional["RepositoryCommit"]:
102112
"--",
103113
str(path),
104114
],
105-
cwd=self.root_path,
115+
cwd=self.path,
106116
)
107117
.decode("utf-8")
108118
.strip()
@@ -121,34 +131,70 @@ def is_shallow_clone(self) -> bool:
121131
return (
122132
subprocess.check_output( # noqa: S603
123133
[GIT_PATH, "rev-parse", "--is-shallow-repository"], # type: ignore
124-
cwd=self.root_path,
134+
cwd=self.path,
125135
stderr=subprocess.DEVNULL,
126136
).strip()
127137
!= b"false"
128138
)
129139
except subprocess.CalledProcessError:
130140
return False
131141

142+
@property
143+
@with_self
144+
def git_root(self) -> Path | None:
145+
if GIT_PATH is None:
146+
return None
147+
try:
148+
return Path(
149+
subprocess.check_output( # noqa: S603
150+
[GIT_PATH, "-C", str(self.path), "rev-parse", "--show-toplevel"],
151+
stderr=subprocess.DEVNULL,
152+
)
153+
.strip()
154+
.decode("utf-8")
155+
)
156+
except subprocess.CalledProcessError:
157+
return None
158+
159+
@with_self
160+
def is_inside_git_work_tree(self) -> bool:
161+
if GIT_PATH is None:
162+
return False
163+
try:
164+
return (
165+
subprocess.check_output( # noqa: S603
166+
[GIT_PATH, "-C", str(self.path), "rev-parse", "--is-inside-work-tree"],
167+
stderr=subprocess.DEVNULL,
168+
)
169+
.strip()
170+
.lower()
171+
== b"true"
172+
)
173+
except subprocess.CalledProcessError:
174+
return False
175+
132176
@property
133177
@with_self
134178
def is_git(self) -> bool:
135-
return self.root_path.is_dir() and (self.root_path / ".git").is_dir()
179+
return self.root_path.is_dir() and ((self.root_path / ".git").is_dir() or self.is_inside_git_work_tree())
136180

137181
@with_self
138182
def git_files(self) -> Iterator["File"]:
139183
if GIT_PATH is None:
140184
msg = "`git` binary could not be found"
141185
raise RepositoryError(msg)
142-
for line in subprocess.check_output([GIT_PATH, "ls-files"], cwd=self.root_path).splitlines(): # noqa: S603
186+
for line in subprocess.check_output([GIT_PATH, "ls-files"], cwd=self.path).splitlines(): # noqa: S603
143187
line = line.strip() # noqa: PLW2901
144188
if line:
145189
path = Path(line.decode("utf-8"))
190+
if self.subdir is not None:
191+
path = self.subdir / path
146192
yield File(path, self)
147193

148194
@with_self
149195
def files(self) -> Iterator["File"]:
150196
if GIT_PATH is None or not self.is_git:
151-
stack: list[File] = [File(self.root_path, self)]
197+
stack: list[File] = [File(self.path, self)]
152198
else:
153199
stack = list(reversed(list(self.git_files())))
154200
history = set()
@@ -166,33 +212,38 @@ def __iter__(self) -> Iterator["File"]:
166212
yield from self.files()
167213

168214
def __repr__(self) -> str:
169-
return f"{self.__class__.__name__}({self.root_path!r})"
215+
return f"{self.__class__.__name__}({self.root_path!r}, rev={self.rev!r}, subdir={self.subdir!r})"
170216

171217
def __str__(self) -> str:
172218
if self.rev:
173-
return f"{self.root_path!s}@{self.rev}"
174-
return f"{self.root_path!s}"
219+
return f"{self.path!s}@{self.rev}"
220+
return f"{self.path!s}"
175221

176222
@classmethod
177-
def load(cls, repo_uri: str) -> "Repository":
223+
def load(cls, repo_uri: str, subdir: Path | str | None = None) -> "Repository":
178224
# first see if it is a local repo
179225
repo_uri_path = Path(repo_uri).absolute()
226+
if subdir is not None and not isinstance(subdir, Path):
227+
subdir = Path(subdir)
180228
if repo_uri_path.exists() and repo_uri_path.is_dir():
181-
return Repository(repo_uri_path)
182-
return RemoteGitRepository(repo_uri)
229+
return Repository(repo_uri_path, subdir=subdir)
230+
return RemoteGitRepository(repo_uri, subdir=subdir)
183231

184232

185233
class _ClonedRepository(Repository):
186-
def __init__(self, clone_uri: str, rev: str | None = None):
234+
def __init__(self, clone_uri: str, rev: str | None = None, subdir: Path | None = None):
187235
self._clone_uri: str = clone_uri
188236
self._entries: int = 0
189237
self._tempdir: TemporaryDirectory | None = None
238+
if subdir is not None and subdir.is_absolute():
239+
msg = f"Invalid subdirectory {subdir!s}: the path must be relative, not absolute"
240+
raise ValueError(msg)
190241
if GIT_PATH is None:
191242
msg = (
192243
f"Error cloning {self._clone_uri}: `git` binary could not be found;please make sure it is in your PATH"
193244
)
194245
raise RepositoryError(msg)
195-
super().__init__(Path(), rev=rev)
246+
super().__init__(Path(), rev=rev, subdir=subdir)
196247

197248
def __enter__(self) -> Self:
198249
self._entries += 1
@@ -329,12 +380,12 @@ def __str__(self) -> str:
329380

330381

331382
class RemoteGitRepository(_ClonedRepository):
332-
def __init__(self, url: str):
383+
def __init__(self, url: str, subdir: Path | None = None):
333384
self.url: str = url
334-
super().__init__(url, rev="")
385+
super().__init__(url, rev="", subdir=subdir)
335386

336387
def __repr__(self) -> str:
337-
return f"{self.__class__.__name__}({self.url!r})"
388+
return f"{self.__class__.__name__}({self.url!r}, subdir={self.subdir!r})"
338389

339390
@property
340391
def is_git(self) -> bool:

0 commit comments

Comments
 (0)