Use an approximation in calculate_edit_distance for scalability

ai-symphony · ai-symphony · commit 30605bdacc62 · 2025-06-05T21:25:52.000-07:00
diff --git a/openevolve/utils/code_utils.py b/openevolve/utils/code_utils.py
@@ -2,6 +2,7 @@
 Utilities for code parsing, diffing, and manipulation
 """
 
+import difflib
 import re
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -146,38 +147,35 @@ def format_diff_summary(diff_blocks: List[Tuple[str, str]]) -> str:
 
 def calculate_edit_distance(code1: str, code2: str) -> int:
     """
-    Calculate the Levenshtein edit distance between two code snippets
+    Calculate an approximate "edit distance" between two code snippets
+    using difflib.SequenceMatcher. This is generally faster than a pure
+    Python Levenshtein implementation and provides an approximation
+    of the edit distance by scaling the dissimilarity ratio.
 
     Args:
         code1: First code snippet
         code2: Second code snippet
 
     Returns:
-        Edit distance (number of operations needed to transform code1 into code2)
+        An approximate integer for the "edit distance".
     """
     if code1 == code2:
         return 0
 
-    # Simple implementation of Levenshtein distance
-    m, n = len(code1), len(code2)
-    dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]
+    # SequenceMatcher finds similarity based on common subsequences.
+    # The ratio() method returns a similarity score between 0.0 and 1.0.
+    matcher = difflib.SequenceMatcher(None, code1, code2)
+    similarity_ratio = matcher.ratio()
 
-    for i in range(m + 1):
-        dp[i][0] = i
+    # Convert similarity to a dissimilarity measure (0.0 to 1.0)
+    dissimilarity = 1.0 - similarity_ratio
 
-    for j in range(n + 1):
-        dp[0][j] = j
+    # Scale the dissimilarity by the length of the longer string
+    # to get an integer "distance" that is somewhat analogous to
+    # Levenshtein distance. This is an approximation.
+    approx_distance = round(dissimilarity * max(len(code1), len(code2)))
 
-    for i in range(1, m + 1):
-        for j in range(1, n + 1):
-            cost = 0 if code1[i - 1] == code2[j - 1] else 1
-            dp[i][j] = min(
-                dp[i - 1][j] + 1,  # deletion
-                dp[i][j - 1] + 1,  # insertion
-                dp[i - 1][j - 1] + cost,  # substitution
-            )
-
-    return dp[m][n]
+    return approx_distance
 
 
 def extract_code_language(code: str) -> str: