Commit 662596f

🎨 [Improvement] Modify the test cases and consider Euclidean distance, etc.
1 parent 194c347 · commit 662596f

1 file changed: backend/utils/document_vector_utils.py (+23 additions, -3 deletions)
@@ -242,6 +242,10 @@ def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_em
     if not clusters or not doc_embeddings:
         return clusters
 
+    # Skip merging if there's only one cluster (nothing to merge)
+    if len(clusters) <= 1:
+        return clusters
+
     # Build a mapping from doc_id to its current cluster
     doc_to_cluster = {}
     for cluster_id, doc_ids in clusters.items():
@@ -275,10 +279,26 @@ def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_em
             cluster1 = doc_to_cluster.get(doc_id1)
             cluster2 = doc_to_cluster.get(doc_id2)
 
-            # If they are in different clusters, merge them
+            # Only merge if they are in different clusters AND truly duplicates
+            # Check both cosine similarity AND Euclidean distance to prevent false positives
             if cluster1 is not None and cluster2 is not None and cluster1 != cluster2:
-                merged_pairs.append((doc_id1, doc_id2, cluster1, cluster2, similarity))
-                logger.info(f"Found duplicate documents: {doc_id1} and {doc_id2} (similarity: {similarity:.4f}) in different clusters {cluster1} and {cluster2}")
+                # Calculate Euclidean distance to ensure they're truly duplicates
+                # Documents that are just in the same direction but far apart should not be merged
+                euclidean_distance = np.linalg.norm(embedding1 - embedding2)
+
+                # Normalize embeddings to get their magnitudes
+                norm1 = np.linalg.norm(embedding1)
+                norm2 = np.linalg.norm(embedding2)
+                avg_norm = (norm1 + norm2) / 2.0
+
+                # Relative distance threshold: if distance is less than 1% of average magnitude,
+                # they are likely true duplicates (same content, different path_or_url)
+                # This prevents merging documents that are just in similar directions
+                relative_distance_threshold = 0.01 * avg_norm if avg_norm > 0 else 0.1
+
+                if euclidean_distance <= relative_distance_threshold:
+                    merged_pairs.append((doc_id1, doc_id2, cluster1, cluster2, similarity))
+                    logger.info(f"Found duplicate documents: {doc_id1} and {doc_id2} (similarity: {similarity:.4f}, distance: {euclidean_distance:.4f}) in different clusters {cluster1} and {cluster2}")
 
     # Merge duplicate documents into the same cluster
     if merged_pairs:
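For reference, below is a minimal, self-contained sketch of the distance guard this commit adds, showing why a relative Euclidean-distance threshold rejects vectors that merely point in the same direction. The is_true_duplicate helper and the sample vectors are illustrative assumptions, not part of document_vector_utils.py.

import numpy as np

def is_true_duplicate(embedding1: np.ndarray, embedding2: np.ndarray) -> bool:
    """Illustrative restatement of the commit's check: treat two embeddings
    as duplicates only if they are nearly the same point, not merely the
    same direction (high cosine similarity)."""
    euclidean_distance = np.linalg.norm(embedding1 - embedding2)
    avg_norm = (np.linalg.norm(embedding1) + np.linalg.norm(embedding2)) / 2.0
    # Same rule as in the diff above: 1% of the average magnitude, with a
    # 0.1 fallback when both vectors have zero norm.
    threshold = 0.01 * avg_norm if avg_norm > 0 else 0.1
    return euclidean_distance <= threshold

# Parallel but far-apart vectors have cosine similarity 1.0 yet are rejected.
a = np.array([1.0, 0.0, 0.0])
b = np.array([2.0, 0.0, 0.0])
print(is_true_duplicate(a, b))           # False: same direction, different magnitude
print(is_true_duplicate(a, a + 0.001))   # True: nearly identical points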
