Skip to content

Commit 30605bd

Browse files
ai-symphony
authored and committed
Using an approximation for the calculate_edit_distance for scalability
1 parent 4b099e3 commit 30605bd

File tree

1 file changed

+17
-19
lines changed

1 file changed

+17
-19
lines changed

openevolve/utils/code_utils.py

Lines changed: 17 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
Utilities for code parsing, diffing, and manipulation
33
"""
44

5+
import difflib
56
import re
67
from typing import Dict, List, Optional, Tuple, Union
78

@@ -146,38 +147,35 @@ def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str:
146147

147148
def calculate_edit_distance(code1: str, code2: str) -> int:
    """
    Calculate an approximate "edit distance" between two code snippets

    The value is derived from difflib.SequenceMatcher rather than a true
    Levenshtein computation: the similarity ratio is converted into a
    dissimilarity and scaled by the length of the longer snippet.

    SequenceMatcher.ratio() may return different values depending on the
    order of its arguments, so the snippets are put into a canonical order
    before matching. This guarantees that
    calculate_edit_distance(a, b) == calculate_edit_distance(b, a).

    Args:
        code1: First code snippet
        code2: Second code snippet

    Returns:
        An approximate integer for the "edit distance" (0 for identical inputs)
    """
    if code1 == code2:
        return 0

    # Canonical order: shorter snippet first, ties broken lexicographically.
    # Without this the result depends on which snippet is passed first,
    # because SequenceMatcher's matching is not symmetric.
    if (len(code2), code2) < (len(code1), code1):
        code1, code2 = code2, code1

    # ratio() is a similarity score in [0.0, 1.0]; the default autojunk
    # heuristic is kept, as in the original call, to stay fast on long inputs.
    matcher = difflib.SequenceMatcher(None, code1, code2)
    dissimilarity = 1.0 - matcher.ratio()

    # Scale by the longer length so the result is roughly comparable to a
    # character-level edit count. This is an approximation.
    return round(dissimilarity * max(len(code1), len(code2)))
181179

182180

183181
def extract_code_language(code: str) -> str:

0 commit comments

Comments
 (0)