@@ -242,6 +242,10 @@ def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_em
242242 if not clusters or not doc_embeddings :
243243 return clusters
244244
245+ # Skip merging if there's only one cluster (nothing to merge)
246+ if len (clusters ) <= 1 :
247+ return clusters
248+
245249 # Build a mapping from doc_id to its current cluster
246250 doc_to_cluster = {}
247251 for cluster_id , doc_ids in clusters .items ():
@@ -275,10 +279,26 @@ def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_em
275279 cluster1 = doc_to_cluster .get (doc_id1 )
276280 cluster2 = doc_to_cluster .get (doc_id2 )
277281
278- # If they are in different clusters, merge them
282+ # Only merge if they are in different clusters AND truly duplicates
283+ # Check both cosine similarity AND Euclidean distance to prevent false positives
279284 if cluster1 is not None and cluster2 is not None and cluster1 != cluster2 :
280- merged_pairs .append ((doc_id1 , doc_id2 , cluster1 , cluster2 , similarity ))
281- logger .info (f"Found duplicate documents: { doc_id1 } and { doc_id2 } (similarity: { similarity :.4f} ) in different clusters { cluster1 } and { cluster2 } " )
285+ # Calculate Euclidean distance to ensure they're truly duplicates
286+ # Documents that are just in the same direction but far apart should not be merged
287+ euclidean_distance = np .linalg .norm (embedding1 - embedding2 )
288+
289+ # Normalize embeddings to get their magnitudes
290+ norm1 = np .linalg .norm (embedding1 )
291+ norm2 = np .linalg .norm (embedding2 )
292+ avg_norm = (norm1 + norm2 ) / 2.0
293+
294+ # Relative distance threshold: if distance is less than 1% of average magnitude,
295+ # they are likely true duplicates (same content, different path_or_url)
296+ # This prevents merging documents that are just in similar directions
297+ relative_distance_threshold = 0.01 * avg_norm if avg_norm > 0 else 0.1
298+
299+ if euclidean_distance <= relative_distance_threshold :
300+ merged_pairs .append ((doc_id1 , doc_id2 , cluster1 , cluster2 , similarity ))
301+ logger .info (f"Found duplicate documents: { doc_id1 } and { doc_id2 } (similarity: { similarity :.4f} , distance: { euclidean_distance :.4f} ) in different clusters { cluster1 } and { cluster2 } " )
282302
283303 # Merge duplicate documents into the same cluster
284304 if merged_pairs :