Skip to content

Commit 9be40b5

Browse files
authored
Merge pull request #17 from trailofbits/diffs
Bugfixes in source output, and output in diff format by default
2 parents 9fb2edb + 2ea4dff commit 9be40b5

File tree

2 files changed

+264
-32
lines changed

2 files changed

+264
-32
lines changed

src/vendetect/_cli.py

Lines changed: 57 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import TextIO
99

1010
from rich import traceback
11-
from rich.console import Console
11+
from rich.console import Console, ConsoleRenderable
1212
from rich.logging import RichHandler
1313
from rich.panel import Panel
1414
from rich.progress import Progress, TaskID
@@ -17,6 +17,7 @@
1717
from rich.text import Text
1818

1919
from .detector import Detection, Status, VenDetector, get_lexer_for_filename
20+
from .diffing import CollapsedDiffLine, Differ, DiffLineStatus, Document
2021
from .errors import VendetectError
2122
from .repo import File, Repository
2223

@@ -145,11 +146,12 @@ def output_json(
145146
json.dump(results, output, indent=2)
146147

147148

148-
def output_rich(
149+
def output_rich( # noqa: PLR0912 PLR0915 C901
149150
detections: Iterable[Detection],
150151
console: Console,
151152
min_similarity: float = 0.5,
152153
output_file: TextIO | None = None,
154+
collapse_identical_lines_threshold: int = 10,
153155
) -> None:
154156
# If an output file is specified, create a new Console for it
155157
file_console = Console(file=output_file) if output_file else console
@@ -178,50 +180,73 @@ def read_file_content(file: File) -> str:
178180
return file.path.read_text()
179181

180182
try:
181-
test_content = read_file_content(d.test)
182-
source_content = read_file_content(d.source)
183+
test_doc = Document.from_file(d.test)
184+
source_doc = Document.from_file(d.source)
183185

184186
# Create a side-by-side view of the detected slices
185187
test_slices = d.comparison.slices1
186188
source_slices = d.comparison.slices2
187189

188190
match_table = Table()
189191
match_table.add_column(f"{d.test.relative_path.name!s}", style="cyan")
192+
match_table.add_column(" ")
190193
match_table.add_column(f"{d.source.relative_path.name!s}", style="green")
191194
first = True
192195

193-
for (test_start, test_end), (source_start, source_end) in zip(test_slices, source_slices, strict=False):
194-
# Extract the content for the detected slices
195-
test_lines = test_content.splitlines()
196-
source_lines = source_content.splitlines()
197-
198-
# Convert character positions to line numbers (approximate)
199-
test_slice_content = "\n".join(test_lines[max(0, test_start - 10) : test_end + 10])
200-
source_slice_content = "\n".join(source_lines[max(0, source_start - 10) : source_end + 10])
201-
202-
# Create syntax-highlighted code panels
203-
test_syntax = Syntax(
204-
test_slice_content,
205-
lexer=get_lexer_for_filename(d.test.relative_path.name),
206-
line_numbers=True,
207-
start_line=max(1, test_start - 10),
208-
highlight_lines=set(range(max(1, test_start), test_end + 1)),
209-
)
210-
211-
source_syntax = Syntax(
212-
source_slice_content,
213-
lexer=get_lexer_for_filename(d.source.relative_path.name),
214-
line_numbers=True,
215-
start_line=max(1, source_start - 10),
216-
highlight_lines=set(range(max(1, source_start), source_end + 1)),
217-
)
196+
test_lexer = get_lexer_for_filename(d.test.relative_path.name)
197+
source_lexer = get_lexer_for_filename(d.source.relative_path.name)
218198

199+
for (test_start_offset, test_end_offset), (source_start_offset, source_end_offset) in zip(
200+
test_slices, source_slices, strict=False
201+
):
219202
if first:
220203
first = False
221204
else:
222-
match_table.add_row(Text(" ⋮", style="dim"), Text(" ⋮", style="dim"))
223-
224-
match_table.add_row(test_syntax, source_syntax)
205+
match_table.add_row(Text(" ⋮", style="dim"), Text("⋮", style="dim"), Text(" ⋮", style="dim"))
206+
207+
for diff_line in Differ(test=test_doc, source=source_doc).diff_from_offsets(
208+
test_start_offset=test_start_offset,
209+
test_end_offset=test_end_offset,
210+
source_start_offset=source_start_offset,
211+
source_end_offset=source_end_offset,
212+
collapse_identical_lines_threshold=collapse_identical_lines_threshold,
213+
):
214+
if isinstance(diff_line, CollapsedDiffLine):
215+
test_msg = diff_line.left
216+
source_msg = diff_line.right
217+
match_table.add_row(
218+
Text(f" {' ' * (len(test_msg) // 2)}⋮", "red reverse bold"),
219+
Text("←", style="red reverse bold"),
220+
Text(f" {' ' * (len(source_msg) // 2)}⋮", "red reverse bold"),
221+
)
222+
match_table.add_row(
223+
Text(f" {test_msg}", style="red reverse bold"),
224+
Text("←", style="red reverse bold"),
225+
Text(f" {source_msg}", style="red reverse bold"),
226+
)
227+
match_table.add_row(
228+
Text(f" {' ' * (len(test_msg) // 2)}⋮", "red reverse bold"),
229+
Text("←", style="red reverse bold"),
230+
Text(f" {' ' * (len(source_msg) // 2)}⋮", "red reverse bold"),
231+
)
232+
else:
233+
if diff_line.status == DiffLineStatus.COPIED:
234+
status_col = Text("←", style="red reverse bold")
235+
else:
236+
status_col = Text("✓", style="green reverse")
237+
if diff_line.left is None:
238+
left: ConsoleRenderable = Text("")
239+
else:
240+
left = Syntax(
241+
diff_line.left, lexer=test_lexer, line_numbers=True, start_line=diff_line.left_line
242+
)
243+
if diff_line.right is None:
244+
right: ConsoleRenderable = Text("")
245+
else:
246+
right = Syntax(
247+
diff_line.right, lexer=source_lexer, line_numbers=True, start_line=diff_line.right_line
248+
)
249+
match_table.add_row(left, status_col, right)
225250

226251
if not first:
227252
context_panel = Panel.fit(

src/vendetect/diffing.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
from difflib import ndiff
2+
from enum import Enum
3+
from typing import Iterable, Iterator, Self # noqa: UP035
4+
5+
from .repo import File
6+
7+
8+
class DiffLineStatus(Enum):
    """Classification of one row in a side-by-side diff."""

    # The row's text is common to both the test and the source document.
    COPIED = "COPIED"
    # The row exists on one side only, or the two sides differ.
    DIFFERENT = "DIFFERENT"
11+
12+
13+
class Rounding(Enum):
    """Direction in which an offset is resolved to a line index."""

    # Resolve toward the earlier line.
    DOWN = "DOWN"
    # Resolve toward the later line.
    UP = "UP"
16+
17+
18+
class DiffLine:
    """One row of a side-by-side diff between a test and a source document.

    Attributes are plain and mutable; a row that starts out one-sided can
    later have its other side filled in.
    """

    def __init__(self, left: str | None, status: DiffLineStatus, right: str | None, left_line: int, right_line: int):
        """Store the row's contents.

        Args:
            left: Text for the left (test) column, or ``None`` if that side is empty.
            status: Whether the row is copied or different.
            right: Text for the right (source) column, or ``None`` if that side is empty.
            left_line: Line number associated with ``left``.
            right_line: Line number associated with ``right``.
        """
        # Text columns.
        self.left: str | None = left
        self.right: str | None = right
        # Row classification.
        self.status: DiffLineStatus = status
        # Line numbers for each column.
        self.left_line: int = left_line
        self.right_line: int = right_line
25+
26+
27+
class CollapsedDiffLine(DiffLine):
    """A single placeholder row that stands in for a run of identical lines.

    The ``left`` and ``right`` texts are human-readable summaries rather than
    document content, and the row's status is always ``COPIED``.
    """

    def __init__(self, left_start_line: int, right_start_line: int, num_identical_lines: int):
        """Build the placeholder row.

        Args:
            left_start_line: Line number reported in the left-hand summary.
            right_start_line: Line number reported in the right-hand summary.
            num_identical_lines: Number of identical lines this row replaces.
        """
        self.left_start_line: int = left_start_line
        self.right_start_line: int = right_start_line
        self.num_identical_lines: int = num_identical_lines
        # Declaration-only annotations: both sides always hold a summary string
        # here, never ``None`` as the base class allows.
        self.left: str
        self.right: str

        left_summary = f"<{self.num_identical_lines} identical lines starting on line {self.left_start_line}>"
        right_summary = f"<{self.num_identical_lines} identical lines starting on line {self.right_start_line}>"
        super().__init__(
            left_summary,
            DiffLineStatus.COPIED,
            right_summary,
            left_line=left_start_line,
            right_line=right_start_line,
        )
41+
42+
43+
class Document:
    """An immutable sequence of text lines with offset-to-line lookup.

    Offsets are measured with ``len()`` on the stored ``str`` lines, i.e. in
    characters (despite the ``byte_offset`` parameter name), and include line
    terminators when the lines were produced by ``from_str``.
    """

    def __init__(self, lines: Iterable[str], start_line: int = 1):
        """Store *lines* and precompute the offset at which each line starts.

        Args:
            lines: The document's lines, in order.
            start_line: Stored as ``self.start_line``; not used by this class itself.
        """
        self.lines: tuple[str, ...] = tuple(lines)
        self.start_line: int = start_line
        # _line_start_offsets[i] is the offset of the first character of line i.
        # It always holds at least the entry for line 0, even for an empty document.
        self._line_start_offsets: list[int] = [0]
        for line in self.lines[:-1]:
            self._line_start_offsets.append(self._line_start_offsets[-1] + len(line))

    def __len__(self) -> int:
        """Return the number of lines."""
        return len(self.lines)

    def __iter__(self) -> Iterator[str]:
        """Iterate over the lines in order."""
        return iter(self.lines)

    def __getitem__(self, index: int | slice) -> str | tuple[str, ...]:
        """Return one line (int index) or a tuple of lines (slice)."""
        return self.lines[index]

    def get_line(self, byte_offset: int, rounding: Rounding = Rounding.DOWN, min_line: int = 0) -> int:
        """Map an offset into the document to a 0-based line index.

        Args:
            byte_offset: Offset from the start of the document. A negative value
                counts back from the end of the document.
            rounding: ``Rounding.DOWN`` returns the index of the line containing
                the offset; any other value returns the index of the first line
                that starts at or after the offset (usable as an exclusive slice end).
            min_line: Lowest index that may be returned; the scan starts here.

        Returns:
            The resolved line index, never smaller than ``max(min_line, 0)``.
        """
        offsets = self._line_start_offsets
        if byte_offset < 0:
            # Translate a from-the-end offset into an absolute one. Guard the
            # empty document, which has no last line to measure.
            last_line_length = len(self.lines[-1]) if self.lines else 0
            byte_offset = offsets[-1] + last_line_length + byte_offset

        line = max(min_line, 0)
        if rounding == Rounding.DOWN:
            # `<=` rather than `<`: an offset sitting exactly on the first
            # character of a line belongs to that line, not the previous one.
            while line + 1 < len(offsets) and offsets[line + 1] <= byte_offset:
                line += 1
            return line

        # Round up: advance past every line that starts before the offset.
        while line < len(offsets) and offsets[line] < byte_offset:
            line += 1
        return line

    @classmethod
    def from_str(cls, text: str) -> Self:
        """Build a document from *text*, keeping line terminators on each line."""
        return cls(text.splitlines(keepends=True))

    @classmethod
    def from_file(cls, file: File) -> Self:
        """Build a document from the text of *file*, read inside its repo's context manager."""
        with file.repo:
            return cls.from_str(file.path.read_text())
82+
83+
84+
class DiffContext:
    """Mutable state and configuration for one side-by-side diff run.

    The ``*_start_line`` / ``*_end_line`` values are used by ``Differ.diff`` as
    slice bounds into the documents' line tuples. ``rows`` buffers rows that
    have not been yielded yet, ``same_lines`` counts the identical rows at the
    end of that buffer, and ``test_line`` / ``source_line`` hold the line number
    of the next line to be added on each side (``Differ.diff`` re-initialises
    all four before diffing).
    """

    def __init__(
        self,
        test_start_line: int = 0,
        test_end_line: int = -1,
        source_start_line: int = 0,
        source_end_line: int = -1,
        collapse_identical_lines_threshold: int = 10,
    ):
        """Record the line ranges to diff and the collapse threshold.

        Args:
            test_start_line: First line index of the test document to diff.
            test_end_line: End line index of the test document range.
            source_start_line: First line index of the source document to diff.
            source_end_line: End line index of the source document range.
            collapse_identical_lines_threshold: Minimum length of a run of
                identical lines for it to be collapsed into a single row.
        """
        self.test_start_line: int = test_start_line
        self.test_end_line: int = test_end_line
        self.source_start_line: int = source_start_line
        self.source_end_line: int = source_end_line
        self.collapse_identical_lines_threshold: int = collapse_identical_lines_threshold
        self.rows: list[DiffLine] = []
        self.same_lines: int = 0
        self.test_line: int = 0
        self.source_line: int = 0

    def _earliest_trailing_gap(self, side: str) -> DiffLine | None:
        """Return the earliest row in the trailing run of rows whose *side* is ``None``.

        *side* is the attribute name ``"left"`` or ``"right"``. Returns ``None``
        when the last buffered row already has that side filled (or no rows exist).
        """
        gap: DiffLine | None = None
        for row in reversed(self.rows):
            if getattr(row, side) is not None:
                break
            gap = row
        return gap

    def add_test_row(self, code: str) -> None:
        """Add a line that exists only in the test document.

        The line is paired with the earliest trailing row that has no left
        side yet; otherwise a new left-only row is appended.
        """
        gap = self._earliest_trailing_gap("left")
        if gap is None:
            # -1 marks "no line" on the side that is empty.
            self.rows.append(DiffLine(code, DiffLineStatus.DIFFERENT, None, self.test_line, -1))
        else:
            gap.left = code
            gap.left_line = self.test_line
        self.same_lines = 0
        self.test_line += 1

    def add_source_row(self, code: str) -> None:
        """Add a line that exists only in the source document.

        The line is paired with the earliest trailing row that has no right
        side yet; otherwise a new right-only row is appended.
        """
        gap = self._earliest_trailing_gap("right")
        if gap is None:
            # -1 marks "no line" on the side that is empty.
            self.rows.append(DiffLine(None, DiffLineStatus.DIFFERENT, code, -1, self.source_line))
        else:
            gap.right = code
            gap.right_line = self.source_line
        self.same_lines = 0
        self.source_line += 1

    def add_identical_row(self, num_identical: int) -> Iterator[DiffLine]:
        """Flush the buffer, replacing its last *num_identical* rows with one collapsed row.

        Yields every buffered row that precedes the identical run, empties the
        buffer, then yields a ``CollapsedDiffLine`` summarising the run.
        """
        # Guard against 0: ``del rows[-0:]`` would wipe the whole buffer.
        if num_identical > 0:
            del self.rows[-num_identical:]
        yield from self.rows
        self.rows.clear()
        # test_line/source_line already point at the line *after* the run, and
        # both advance by one per identical line, so step back to the run's start.
        yield CollapsedDiffLine(
            self.test_line - num_identical,
            self.source_line - num_identical,
            num_identical,
        )
137+
138+
139+
class Differ:
    """Produce side-by-side diff rows between a test and a source document."""

    def __init__(
        self,
        test: Document,
        source: Document,
    ):
        """Remember the two documents to compare."""
        self.test: Document = test
        self.source: Document = source

    @staticmethod
    def _resolve_end(end_line: int, num_lines: int) -> int:
        """Turn a possibly negative end line into a non-negative exclusive slice end.

        Negative values count back from the end of the document, with ``-1``
        meaning "through the last line".
        """
        if end_line >= 0:
            return end_line
        return max(0, num_lines + end_line + 1)

    def diff(self, context: DiffContext | None = None) -> Iterator[DiffLine]:
        """Diff the line ranges described by *context* and yield the resulting rows.

        Rows are buffered in ``context.rows``. They are yielded when a run of
        identical lines reaches ``context.collapse_identical_lines_threshold``
        (the run is then replaced by a single ``CollapsedDiffLine``) and once
        more at the end. *context* is mutated; a default one covering both
        whole documents is created when it is ``None``.
        """
        if context is None:
            context = DiffContext()

        # Resolve negative end lines explicitly: used raw as a slice bound, the
        # default of -1 would silently drop the last line of each document.
        test_end = self._resolve_end(context.test_end_line, len(self.test))
        source_end = self._resolve_end(context.source_end_line, len(self.source))
        test_slice_content = self.test[max(0, context.test_start_line) : test_end]
        source_slice_content = self.source[max(0, context.source_start_line) : source_end]

        # Displayed line numbers are 1-based.
        context.test_line = max(1, context.test_start_line + 1)
        context.source_line = max(1, context.source_start_line + 1)
        context.rows = []
        context.same_lines = 0

        for diff_line in ndiff(test_slice_content, source_slice_content):
            if diff_line.startswith(" "):
                # Line common to both documents.
                context.same_lines += 1
                copied_str = diff_line[2:].rstrip()
                context.rows.append(
                    DiffLine(
                        copied_str,
                        DiffLineStatus.COPIED,
                        copied_str,
                        context.test_line,
                        context.source_line,
                    )
                )
                context.test_line += 1
                context.source_line += 1
                continue

            prefix = diff_line[:2]
            # A change ends the current identical run; collapse it if it is long enough.
            if prefix in ("- ", "+ ") and context.same_lines >= context.collapse_identical_lines_threshold:
                yield from context.add_identical_row(context.same_lines)
            if prefix == "- ":
                context.add_test_row(diff_line[2:].rstrip())
            elif prefix == "+ ":
                context.add_source_row(diff_line[2:].rstrip())
            # Any other prefix (ndiff's "? " hint lines) is ignored.

        # Collapse an identical run that extends to the end of the range.
        if context.same_lines >= context.collapse_identical_lines_threshold:
            yield from context.add_identical_row(context.same_lines)

        yield from context.rows

    def diff_from_offsets(
        self,
        test_start_offset: int = 0,
        test_end_offset: int = -1,
        source_start_offset: int = 0,
        source_end_offset: int = -1,
        collapse_identical_lines_threshold: int = 10,
    ) -> Iterator[DiffLine]:
        """Diff the regions given as document offsets rather than line indices.

        Start offsets are rounded down and end offsets rounded up to line
        boundaries via ``Document.get_line``; each end is forced to lie at
        least one line past its start.
        """
        test_start = self.test.get_line(test_start_offset, rounding=Rounding.DOWN)
        test_end = self.test.get_line(test_end_offset, rounding=Rounding.UP, min_line=test_start + 1)
        source_start = self.source.get_line(source_start_offset, rounding=Rounding.DOWN)
        source_end = self.source.get_line(source_end_offset, rounding=Rounding.UP, min_line=source_start + 1)
        return self.diff(
            DiffContext(
                test_start_line=test_start,
                test_end_line=test_end,
                source_start_line=source_start,
                source_end_line=source_end,
                collapse_identical_lines_threshold=collapse_identical_lines_threshold,
            )
        )

0 commit comments

Comments
 (0)