
Commit 6053855

🎨 Fix the issue where uploading the same document multiple times caused the knowledge base summary to classify the copies into multiple categories.
2 parents: a75a6c2 + 1ab0968

File tree

3 files changed: +262 -8 lines


backend/utils/document_vector_utils.py

Lines changed: 175 additions & 4 deletions
@@ -17,6 +17,7 @@
 from jinja2 import Template, StrictUndefined
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
+from sklearn.metrics.pairwise import cosine_similarity

 from consts.const import LANGUAGE

@@ -77,7 +78,15 @@ def get_documents_from_es(index_name: str, es_core, sample_doc_count: int = 200)
             "query": {
                 "term": {"path_or_url": path_or_url}
             },
-            "size": chunk_count  # Get all chunks
+            "size": chunk_count,  # Get all chunks
+            "sort": [
+                {
+                    "create_time": {
+                        "order": "asc",
+                        "missing": "_last"  # Put documents without create_time at the end
+                    }
+                }
+            ]
         }

         chunks_response = es_core.client.search(index=index_name, body=chunks_query)
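
For reference, a minimal sketch of the query body this hunk now builds; the path and chunk count below are made-up placeholders, only the structure comes from the diff. Sorting by create_time ascending with "missing": "_last" keeps the chunk order stable across repeated uploads, and chunks indexed before create_time was recorded simply land at the end.

# Hypothetical example of the assembled chunks_query; values are placeholders.
chunks_query = {
    "query": {
        "term": {"path_or_url": "/kb/docs/report.pdf"}
    },
    "size": 42,  # chunk_count: fetch all chunks of this document
    "sort": [
        {"create_time": {"order": "asc", "missing": "_last"}}
    ]
}
# Used as in the diff: es_core.client.search(index=index_name, body=chunks_query)
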
@@ -124,10 +133,9 @@ def calculate_document_embedding(doc_chunks: List[Dict], use_weighted: bool = True
             embeddings.append(np.array(chunk_embedding))

             if use_weighted:
-                # Weight by content length
+                # Weight by content length only (removed position-based weight to reduce order dependency)
                 content_length = len(chunk.get('content', ''))
-                position_weight = 1.5 if len(embeddings) == 1 else 1.0  # First chunk has higher weight
-                weight = position_weight * content_length
+                weight = content_length
                 weights.append(weight)

         if not embeddings:
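
A small, self-contained sketch (toy data, not the repo's code) of what this change means: with content-length-only weights, the document embedding is identical regardless of which chunk Elasticsearch happens to return first, whereas the old 1.5x first-chunk bonus made it depend on retrieval order.

import numpy as np

chunks = [
    {'embedding': [1.0, 2.0], 'content': 'Short'},
    {'embedding': [3.0, 4.0], 'content': 'Long content with more words'},
]

def doc_embedding(chunks):
    # Weighted average where each chunk's weight is its content length
    embeddings = np.array([c['embedding'] for c in chunks], dtype=float)
    weights = np.array([len(c['content']) for c in chunks], dtype=float)
    return np.average(embeddings, axis=0, weights=weights)

print(doc_embedding(chunks))        # ~[2.71, 3.71]
print(doc_embedding(chunks[::-1]))  # same result with the chunk order reversed
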
@@ -217,6 +225,162 @@ def auto_determine_k(embeddings: np.ndarray, min_k: int = 3, max_k: int = 15) ->
     return heuristic_k


+def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_embeddings: Dict[str, np.ndarray], similarity_threshold: float = 0.98) -> Dict[int, List[str]]:
+    """
+    Post-process clusters to merge duplicate documents (same content but different path_or_url)
+    that were incorrectly split into different clusters.
+
+    Args:
+        clusters: Dictionary mapping cluster IDs to lists of document IDs
+        doc_embeddings: Dictionary mapping document IDs to their embeddings
+        similarity_threshold: Cosine similarity threshold to consider documents as duplicates (default: 0.98)
+
+    Returns:
+        Updated clusters dictionary with duplicate documents merged
+    """
+    try:
+        if not clusters or not doc_embeddings:
+            return clusters
+
+        # Skip merging if there's only one cluster (nothing to merge)
+        if len(clusters) <= 1:
+            return clusters
+
+        # Build a mapping from doc_id to its current cluster
+        doc_to_cluster = {}
+        for cluster_id, doc_ids in clusters.items():
+            for doc_id in doc_ids:
+                doc_to_cluster[doc_id] = cluster_id
+
+        # Find duplicate pairs with high similarity
+        doc_ids_list = list(doc_embeddings.keys())
+        merged_pairs = []
+
+        for i, doc_id1 in enumerate(doc_ids_list):
+            if doc_id1 not in doc_embeddings:
+                continue
+
+            embedding1 = doc_embeddings[doc_id1]
+
+            for j, doc_id2 in enumerate(doc_ids_list[i+1:], start=i+1):
+                if doc_id2 not in doc_embeddings:
+                    continue
+
+                embedding2 = doc_embeddings[doc_id2]
+
+                # Calculate cosine similarity
+                similarity = cosine_similarity(
+                    embedding1.reshape(1, -1),
+                    embedding2.reshape(1, -1)
+                )[0][0]
+
+                # If similarity is very high, they are likely duplicates
+                if similarity >= similarity_threshold:
+                    cluster1 = doc_to_cluster.get(doc_id1)
+                    cluster2 = doc_to_cluster.get(doc_id2)
+
+                    # Only merge if they are in different clusters AND truly duplicates
+                    # Check both cosine similarity AND Euclidean distance to prevent false positives
+                    if cluster1 is not None and cluster2 is not None and cluster1 != cluster2:
+                        # Calculate Euclidean distance to ensure they're truly duplicates
+                        # Documents that merely point in the same direction but lie far apart should not be merged
+                        euclidean_distance = np.linalg.norm(embedding1 - embedding2)
+
+                        # Compute the embedding magnitudes to get a reference scale
+                        norm1 = np.linalg.norm(embedding1)
+                        norm2 = np.linalg.norm(embedding2)
+                        avg_norm = (norm1 + norm2) / 2.0
+
+                        # Relative distance threshold: if the distance is less than 1% of the average magnitude,
+                        # they are likely true duplicates (same content, different path_or_url).
+                        # This prevents merging documents that are just in similar directions
+                        relative_distance_threshold = 0.01 * avg_norm if avg_norm > 0 else 0.1
+
+                        if euclidean_distance <= relative_distance_threshold:
+                            merged_pairs.append((doc_id1, doc_id2, cluster1, cluster2, similarity))
+                            logger.info(f"Found duplicate documents: {doc_id1} and {doc_id2} (similarity: {similarity:.4f}, distance: {euclidean_distance:.4f}) in different clusters {cluster1} and {cluster2}")

+        # Merge duplicate documents into the same cluster
+        if merged_pairs:
+            logger.info(f"Merging {len(merged_pairs)} pairs of duplicate documents")
+
+            # Build a graph of duplicate relationships using union-find
+            parent = {}
+
+            def find(x):
+                if x not in parent:
+                    parent[x] = x
+                if parent[x] != x:
+                    parent[x] = find(parent[x])
+                return parent[x]
+
+            def union(x, y):
+                px, py = find(x), find(y)
+                if px != py:
+                    parent[px] = py
+
+            # Build union-find structure
+            for doc_id1, doc_id2, _, _, _ in merged_pairs:
+                union(doc_id1, doc_id2)
+
+            # Group documents by their root parent
+            # Only include documents that are part of duplicate pairs
+            duplicate_doc_ids = set()
+            for doc_id1, doc_id2, _, _, _ in merged_pairs:
+                duplicate_doc_ids.add(doc_id1)
+                duplicate_doc_ids.add(doc_id2)
+
+            groups = {}
+            for doc_id in duplicate_doc_ids:
+                root = find(doc_id)
+                if root not in groups:
+                    groups[root] = []
+                groups[root].append(doc_id)
+
+            # Merge each group into the same cluster
+            for root, doc_group in groups.items():
+                if len(doc_group) < 2:
+                    continue
+
+                # Find all clusters containing documents in this group
+                clusters_in_group = set()
+                for doc_id in doc_group:
+                    if doc_id in doc_to_cluster:
+                        clusters_in_group.add(doc_to_cluster[doc_id])
+
+                if len(clusters_in_group) > 1:
+                    # Merge all documents into the smallest cluster ID
+                    target_cluster = min(clusters_in_group)
+
+                    for doc_id in doc_group:
+                        current_cluster = doc_to_cluster.get(doc_id)
+                        if current_cluster is not None and current_cluster != target_cluster:
+                            # Move document to target cluster
+                            if current_cluster in clusters and doc_id in clusters[current_cluster]:
+                                clusters[current_cluster].remove(doc_id)
+                            if target_cluster not in clusters:
+                                clusters[target_cluster] = []
+                            if doc_id not in clusters[target_cluster]:
+                                clusters[target_cluster].append(doc_id)
+                            doc_to_cluster[doc_id] = target_cluster
+                            logger.debug(f"Moved {doc_id} from cluster {current_cluster} to cluster {target_cluster}")
+
+            # Remove empty clusters
+            empty_clusters = [cid for cid, docs in clusters.items() if not docs]
+            for cid in empty_clusters:
+                del clusters[cid]
+                logger.debug(f"Removed empty cluster {cid}")
+
+            logger.info(f"Successfully merged duplicate documents. Final cluster count: {len(clusters)}")
+
+        return clusters
+
+    except Exception as e:
+        logger.error(f"Error merging duplicate documents: {str(e)}", exc_info=True)
+        # Return original clusters if merge fails
+        return clusters
+
+
 def kmeans_cluster_documents(doc_embeddings: Dict[str, np.ndarray], k: Optional[int] = None) -> Dict[int, List[str]]:
     """
     Cluster documents using K-means
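
The two-stage check above (cosine similarity plus a relative Euclidean distance guard) is easiest to see on toy numbers; the vectors below are invented for illustration. Two copies of the same document yield near-identical embeddings and pass both tests, while a vector that merely points in the same direction passes the cosine test but fails the distance guard, so it is not treated as a duplicate.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

a = np.array([1.000, 2.000, 3.000])
b = np.array([1.001, 2.001, 3.001])  # near-identical copy (true duplicate)
c = np.array([1.300, 2.600, 3.900])  # same direction, different magnitude

for name, other in (("b", b), ("c", c)):
    cos = cosine_similarity(a.reshape(1, -1), other.reshape(1, -1))[0][0]
    dist = np.linalg.norm(a - other)
    avg_norm = (np.linalg.norm(a) + np.linalg.norm(other)) / 2.0
    print(name, cos >= 0.98, dist <= 0.01 * avg_norm)
# b: True True  -> merged as a duplicate
# c: True False -> cosine alone would merge it; the distance guard rejects it
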
@@ -266,6 +430,13 @@ def kmeans_cluster_documents(doc_embeddings: Dict[str, np.ndarray], k: Optional[int] = None) -> Dict[int, List[str]]:
         for cluster_id, docs in clusters.items():
             logger.info(f"Cluster {cluster_id}: {len(docs)} documents")

+        # Post-process: merge duplicate documents that were split into different clusters
+        clusters = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98)
+
+        # Log final cluster sizes after merge
+        for cluster_id, docs in clusters.items():
+            logger.info(f"Final cluster {cluster_id}: {len(docs)} documents")
+
         return clusters

     except Exception as e:
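
A minimal usage sketch of the new post-processing step on toy data. The document IDs and embeddings are invented, and the import path is an assumption based on the file location; in the repo the call happens inside kmeans_cluster_documents as the hunk above shows.

import numpy as np
# Assumed import path; adjust to the project's package layout.
from utils.document_vector_utils import merge_duplicate_documents_in_clusters

# Two uploads of the same file were split across K-means clusters.
clusters = {0: ['report_v1.pdf', 'other.docx'], 1: ['report_v2.pdf']}
doc_embeddings = {
    'report_v1.pdf': np.array([0.2, 0.9, 0.4]),
    'report_v2.pdf': np.array([0.2, 0.9, 0.4]),  # identical embedding => duplicate upload
    'other.docx':    np.array([0.9, 0.1, 0.3]),
}

clusters = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98)
print(clusters)
# Expected: {0: ['report_v1.pdf', 'other.docx', 'report_v2.pdf']}; the duplicate pair
# is pulled into one cluster and the emptied cluster 1 is removed.
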

test/backend/test_document_vector_utils.py

Lines changed: 85 additions & 3 deletions
@@ -27,7 +27,8 @@
     get_documents_from_es,
     process_documents_for_clustering,
     extract_cluster_content,
-    analyze_cluster_coherence
+    analyze_cluster_coherence,
+    merge_duplicate_documents_in_clusters
 )

@@ -48,7 +49,7 @@ def test_calculate_document_embedding_simple_average(self):
         assert np.allclose(result, [4.0, 5.0, 6.0])  # Average of all embeddings

     def test_calculate_document_embedding_weighted(self):
-        """Test weighted average embedding calculation"""
+        """Test weighted average embedding calculation (no position weight)"""
         chunks = [
             {'embedding': [1.0, 2.0], 'content': 'Short'},
             {'embedding': [3.0, 4.0], 'content': 'Long content with more words'},

@@ -59,6 +60,9 @@ def test_calculate_document_embedding_weighted(self):

         assert result is not None
         assert len(result) == 2
+        # Weight should be based on content length only, not position
+        # First chunk should NOT have extra 1.5x weight
+        # Result should be weighted average where longer chunks have more weight

     def test_calculate_document_embedding_empty_chunks(self):
         """Test handling of empty chunks"""

@@ -345,7 +349,8 @@ def test_get_documents_from_es_mock(self):
                        'path_or_url': '/path/doc1.pdf',
                        'filename': 'doc1.pdf',
                        'content': 'Content 1',
-                        'embedding': [1.0, 2.0, 3.0]
+                        'embedding': [1.0, 2.0, 3.0],
+                        'create_time': '2024-01-01T00:00:00'
                    }
                }
            ]

@@ -370,6 +375,16 @@ def test_get_documents_from_es_mock(self):
         # Check that we have document data
         first_doc = list(result.values())[0]
         assert 'chunks' in first_doc
+
+        # Verify that sort parameter is included in the query
+        call_args = mock_es_core.client.search.call_args
+        if call_args:
+            query_body = call_args[1].get('body') or (call_args[0][1] if len(call_args[0]) > 1 else None)
+            if query_body and 'sort' in query_body:
+                sort_config = query_body['sort']
+                assert isinstance(sort_config, list)
+                # Should have create_time sort
+                assert any('create_time' in str(sort_item) for sort_item in sort_config)


 class TestProcessDocumentsForClustering:

@@ -465,6 +480,73 @@ def test_analyze_cluster_coherence(self):
         assert result['doc_count'] == 2


+class TestMergeDuplicateDocumentsInClusters:
+    """Test duplicate document merging in clusters"""
+
+    def test_merge_duplicate_documents_same_cluster(self):
+        """Test that documents in same cluster are not merged"""
+        clusters = {
+            0: ['doc1', 'doc2'],
+            1: ['doc3']
+        }
+        doc_embeddings = {
+            'doc1': np.array([1.0, 0.0]),
+            'doc2': np.array([0.9, 0.1]),
+            'doc3': np.array([0.0, 1.0])
+        }
+
+        result = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98)
+
+        # Documents with similarity < 0.98 should not be merged
+        assert len(result) == 2
+        assert 0 in result
+        assert 1 in result
+
+    def test_merge_duplicate_documents_different_clusters(self):
+        """Test that highly similar documents in different clusters are merged"""
+        clusters = {
+            0: ['doc1'],
+            1: ['doc2']
+        }
+        # Create two identical embeddings (duplicate documents)
+        identical_embedding = np.array([1.0, 0.0, 0.0])
+        doc_embeddings = {
+            'doc1': identical_embedding,
+            'doc2': identical_embedding.copy()  # Same embedding
+        }
+
+        result = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98)
+
+        # Documents with similarity >= 0.98 should be merged into the same cluster
+        # Result should have fewer clusters
+        assert len(result) <= 2
+
+    def test_merge_duplicate_documents_empty_clusters(self):
+        """Test handling of empty clusters"""
+        clusters = {}
+        doc_embeddings = {}
+
+        result = merge_duplicate_documents_in_clusters(clusters, doc_embeddings)
+
+        assert result == {}
+
+    def test_merge_duplicate_documents_error_handling(self):
+        """Test error handling in merge function"""
+        clusters = {
+            0: ['doc1', 'doc2']
+        }
+        doc_embeddings = {
+            'doc1': np.array([1.0, 0.0]),
+            'doc2': np.array([0.9, 0.1])
+        }
+
+        # Should not raise exception even with invalid similarity calculation
+        result = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=2.0)
+
+        # Should return clusters (possibly unchanged due to high threshold)
+        assert isinstance(result, dict)
+
+
 if __name__ == '__main__':
     pytest.main([__file__, '-v'])

test/backend/test_document_vector_utils_coverage.py

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@
    merge_cluster_summaries,
    calculate_document_embedding,
    auto_determine_k,
-   kmeans_cluster_documents
+   kmeans_cluster_documents,
+   merge_duplicate_documents_in_clusters
 )
