Skip to content

Commit 573fabf

Browse files
committed
Adds fuzzy matching based on edit distance
1 parent 219f1ba commit 573fabf

File tree

3 files changed

+26
-3
lines changed

3 files changed

+26
-3
lines changed

src/vendetect/_cli.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from rich.text import Text
1818

1919
from .detector import Detection, Status, VenDetector, get_lexer_for_filename
20-
from .diffing import CollapsedDiffLine, Differ, DiffLineStatus, Document
20+
from .diffing import CollapsedDiffLine, Differ, DiffLineStatus, Document, normalized_edit_distance
2121
from .errors import VendetectError
2222
from .repo import File, Repository
2323

@@ -233,7 +233,13 @@ def read_file_content(file: File) -> str:
233233
if diff_line.status == DiffLineStatus.COPIED:
234234
status_col = Text("←", style="red reverse bold")
235235
else:
236-
status_col = Text("✓", style="green reverse")
236+
# calculate the normalized edit distance
237+
if diff_line.left is not None and diff_line.right is not None and \
238+
normalized_edit_distance(diff_line.left, diff_line.right) < 0.75:
239+
# the lines are at least 25% similar!
240+
status_col = Text("↜", style="yellow reverse")
241+
else:
242+
status_col = Text("✓", style="green reverse")
237243
if diff_line.left is None:
238244
left: ConsoleRenderable = Text("")
239245
else:

src/vendetect/detector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __repr__(self) -> str:
6363

6464
def byte_offset_slice_to_lines_slice(self, to_convert: Slice) -> Slice:
6565
from_line = self.file.get_line(to_convert.from_index, rounding=Rounding.DOWN)
66-
to_line = self.file.get_line(to_convert.to_index, rounding=Rounding.UP, min_line=from_line+1)
66+
to_line = self.file.get_line(to_convert.to_index, rounding=Rounding.UP, min_line=from_line + 1)
6767
return Slice(from_line, to_line)
6868

6969
def to_str(self, max_slices: int = -1) -> str:

src/vendetect/diffing.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,20 @@ def diff_from_offsets(
200200
collapse_identical_lines_threshold=collapse_identical_lines_threshold,
201201
)
202202
)
203+
204+
205+
def edit_distance(s1: str, s2: str) -> int:
    """Return the Levenshtein distance between *s1* and *s2*.

    The distance is the minimum number of single-character insertions,
    deletions, and substitutions needed to turn *s1* into *s2*.  A
    substitution counts as ONE edit, so the result never exceeds
    ``max(len(s1), len(s2))``.

    Implemented as a two-row dynamic programme: time is proportional to
    ``len(s1) * len(s2)`` and extra memory to the shorter string's length.

    >>> edit_distance("kitten", "sitting")
    3
    >>> edit_distance("", "abc")
    3
    """
    if s1 == s2:
        return 0
    # The distance is symmetric, so keep the shorter string on the inner
    # axis to minimise the size of the rows we allocate.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        # Everything in the longer string has to be deleted/inserted.
        return len(s1)

    # previous[j] == distance between the processed prefix of s1 and s2[:j]
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current = [i]  # turning s1[:i] into "" takes i deletions
        for j, c2 in enumerate(s2, start=1):
            current.append(
                min(
                    previous[j] + 1,  # delete c1
                    current[j - 1] + 1,  # insert c2
                    previous[j - 1] + (c1 != c2),  # substitute (free on match)
                )
            )
        previous = current
    return previous[-1]
213+
214+
215+
def normalized_edit_distance(s1: str, s2: str) -> float:
    """Return the edit distance between *s1* and *s2* scaled to ``[0.0, 1.0]``.

    ``0.0`` means the strings are identical; ``1.0`` means they are as
    different as two strings of these lengths can be.  The raw distance from
    ``edit_distance`` is divided by the length of the longer string.
    """
    if s1 == s2:
        # Also covers two empty strings, which would otherwise divide by zero.
        return 0.0
    # The strings differ, so at least one is non-empty and `longest` >= 1.
    longest = max(len(s1), len(s2))
    # Clamp so the documented range holds even if the underlying distance
    # counts a substitution as two edits (one delete plus one insert).
    return min(1.0, edit_distance(s1, s2) / longest)

0 commit comments

Comments
 (0)