|
2 | 2 | Utilities for code parsing, diffing, and manipulation |
3 | 3 | """ |
4 | 4 |
|
| 5 | +import difflib |
5 | 6 | import re |
6 | 7 | from typing import Dict, List, Optional, Tuple, Union |
7 | 8 |
|
@@ -146,38 +147,35 @@ def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str: |
146 | 147 |
|
def calculate_edit_distance(code1: str, code2: str) -> int:
    """
    Calculate an approximate edit distance between two code snippets.

    The snippets are aligned character-by-character with
    difflib.SequenceMatcher, and the cost of every non-matching region of
    that alignment is summed. A region that rewrites ``p`` characters of
    code1 into ``q`` characters of code2 costs ``max(p, q)``: that many
    single-character substitutions, insertions and deletions are enough
    to perform the rewrite.

    The alignment SequenceMatcher picks is not guaranteed to be the
    cheapest one, so the result is an upper bound on the true Levenshtein
    distance rather than the exact value.

    Args:
        code1: First code snippet
        code2: Second code snippet

    Returns:
        An approximate edit distance: 0 when the snippets are identical,
        otherwise a positive integer no smaller than the Levenshtein
        distance between them.
    """
    if code1 == code2:
        return 0

    # autojunk must be disabled: with the default (True), SequenceMatcher
    # ignores every character that makes up more than ~1% of a second
    # sequence of 200+ items. For source text that is almost every
    # character, which makes near-identical snippets look unrelated.
    matcher = difflib.SequenceMatcher(None, code1, code2, autojunk=False)

    # Each opcode is (tag, i1, i2, j1, j2): code1[i1:i2] maps to
    # code2[j1:j2]. "equal" regions cost nothing; "replace", "delete" and
    # "insert" regions cost the length of the longer side.
    return sum(
        max(a_end - a_start, b_end - b_start)
        for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes()
        if tag != "equal"
    )
181 | 179 |
|
182 | 180 |
|
183 | 181 | def extract_code_language(code: str) -> str: |
|
0 commit comments