17 | 17 | from jinja2 import Template, StrictUndefined |
18 | 18 | from sklearn.cluster import KMeans |
19 | 19 | from sklearn.metrics import silhouette_score |
| 20 | +from sklearn.metrics.pairwise import cosine_similarity |
20 | 21 |
21 | 22 | from consts.const import LANGUAGE |
22 | 23 |
@@ -77,7 +78,15 @@ def get_documents_from_es(index_name: str, es_core, sample_doc_count: int = 200) |
77 | 78 | "query": { |
78 | 79 | "term": {"path_or_url": path_or_url} |
79 | 80 | }, |
80 | | - "size": chunk_count # Get all chunks |
| 81 | + "size": chunk_count, # Get all chunks |
| 82 | + "sort": [ |
| 83 | + { |
| 84 | + "create_time": { |
| 85 | + "order": "asc", |
| 86 | + "missing": "_last" # Put documents without create_time at the end |
| 87 | + } |
| 88 | + } |
| 89 | + ] |
81 | 90 | } |
82 | 91 |
83 | 92 | chunks_response = es_core.client.search(index=index_name, body=chunks_query) |
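Note on the sort clause added above: ordering by `create_time` keeps chunks in ingestion order, and `"missing": "_last"` tolerates legacy chunks that lack the field. Below is a minimal sketch of the same query run directly through an elasticsearch-py client; the client URL, index name, path value, and size are placeholders, not taken from this repo.

```python
# Sketch only: exercises the sorted chunk query in isolation.
# "docs_index", the path value, and size=100 are hypothetical placeholders.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

chunks_query = {
    "query": {"term": {"path_or_url": "s3://bucket/report.pdf"}},
    "size": 100,
    "sort": [
        {"create_time": {"order": "asc", "missing": "_last"}}  # undated chunks sort last
    ],
}

resp = es.search(index="docs_index", body=chunks_query)
chunks = [hit["_source"] for hit in resp["hits"]["hits"]]
```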
@@ -124,10 +133,9 @@ def calculate_document_embedding(doc_chunks: List[Dict], use_weighted: bool = Tr |
124 | 133 | embeddings.append(np.array(chunk_embedding)) |
125 | 134 |
126 | 135 | if use_weighted: |
127 | | - # Weight by content length |
| 136 | + # Weight by content length only (removed position-based weight to reduce order dependency) |
128 | 137 | content_length = len(chunk.get('content', '')) |
129 | | - position_weight = 1.5 if len(embeddings) == 1 else 1.0 # First chunk has higher weight |
130 | | - weight = position_weight * content_length |
| 138 | + weight = content_length |
131 | 139 | weights.append(weight) |
132 | 140 |
133 | 141 | if not embeddings: |
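The weighting change above makes the document embedding a pure length-weighted mean of its chunk embeddings, so reordering the chunks no longer changes the result. A standalone illustration of that averaging with toy chunks (values are made up, not from the repo):

```python
# Illustrative only: length-weighted mean of chunk embeddings, mirroring the
# weighting change above. Chunk contents and vectors are toy data.
import numpy as np

chunks = [
    {"content": "short chunk", "embedding": [0.1, 0.9]},
    {"content": "a much longer chunk with considerably more content", "embedding": [0.8, 0.2]},
]

embeddings = np.array([c["embedding"] for c in chunks], dtype=float)
weights = np.array([len(c["content"]) for c in chunks], dtype=float)

# np.average normalizes the weights, so longer chunks pull the mean harder,
# independent of the order in which chunks arrive.
doc_embedding = np.average(embeddings, axis=0, weights=weights)
print(doc_embedding)
```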
@@ -217,6 +225,162 @@ def auto_determine_k(embeddings: np.ndarray, min_k: int = 3, max_k: int = 15) -> |
217 | 225 | return heuristic_k |
218 | 226 |
219 | 227 |
| 228 | +def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_embeddings: Dict[str, np.ndarray], similarity_threshold: float = 0.98) -> Dict[int, List[str]]: |
| 229 | + """ |
| 230 | + Post-process clusters to merge duplicate documents (same content but different path_or_url) |
| 231 | + that were incorrectly split into different clusters. |
| 232 | + |
| 233 | + Args: |
| 234 | + clusters: Dictionary mapping cluster IDs to lists of document IDs |
| 235 | + doc_embeddings: Dictionary mapping document IDs to their embeddings |
| 236 | + similarity_threshold: Cosine similarity threshold to consider documents as duplicates (default: 0.98) |
| 237 | + |
| 238 | + Returns: |
| 239 | + Updated clusters dictionary with duplicate documents merged |
| 240 | + """ |
| 241 | + try: |
| 242 | + if not clusters or not doc_embeddings: |
| 243 | + return clusters |
| 244 | + |
| 245 | + # Skip merging if there's only one cluster (nothing to merge) |
| 246 | + if len(clusters) <= 1: |
| 247 | + return clusters |
| 248 | + |
| 249 | + # Build a mapping from doc_id to its current cluster |
| 250 | + doc_to_cluster = {} |
| 251 | + for cluster_id, doc_ids in clusters.items(): |
| 252 | + for doc_id in doc_ids: |
| 253 | + doc_to_cluster[doc_id] = cluster_id |
| 254 | + |
| 255 | + # Find duplicate pairs with high similarity |
| 256 | + doc_ids_list = list(doc_embeddings.keys()) |
| 257 | + merged_pairs = [] |
| 258 | + |
| 259 | + for i, doc_id1 in enumerate(doc_ids_list): |
| 260 | + if doc_id1 not in doc_embeddings: |
| 261 | + continue |
| 262 | + |
| 263 | + embedding1 = doc_embeddings[doc_id1] |
| 264 | + |
| 265 | + for j, doc_id2 in enumerate(doc_ids_list[i+1:], start=i+1): |
| 266 | + if doc_id2 not in doc_embeddings: |
| 267 | + continue |
| 268 | + |
| 269 | + embedding2 = doc_embeddings[doc_id2] |
| 270 | + |
| 271 | + # Calculate cosine similarity |
| 272 | + similarity = cosine_similarity( |
| 273 | + embedding1.reshape(1, -1), |
| 274 | + embedding2.reshape(1, -1) |
| 275 | + )[0][0] |
| 276 | + |
| 277 | + # If similarity is very high, they are likely duplicates |
| 278 | + if similarity >= similarity_threshold: |
| 279 | + cluster1 = doc_to_cluster.get(doc_id1) |
| 280 | + cluster2 = doc_to_cluster.get(doc_id2) |
| 281 | + |
| 282 | + # Only merge if they are in different clusters AND truly duplicates |
| 283 | + # Check both cosine similarity AND Euclidean distance to prevent false positives |
| 284 | + if cluster1 is not None and cluster2 is not None and cluster1 != cluster2: |
| 285 | + # Calculate Euclidean distance to ensure they're truly duplicates |
| 286 | + # Documents that are just in the same direction but far apart should not be merged |
| 287 | + euclidean_distance = np.linalg.norm(embedding1 - embedding2) |
| 288 | + |
| 289 | + # Normalize embeddings to get their magnitudes |
| 290 | + norm1 = np.linalg.norm(embedding1) |
| 291 | + norm2 = np.linalg.norm(embedding2) |
| 292 | + avg_norm = (norm1 + norm2) / 2.0 |
| 293 | + |
| 294 | + # Relative distance threshold: if distance is less than 1% of average magnitude, |
| 295 | + # they are likely true duplicates (same content, different path_or_url) |
| 296 | + # This prevents merging documents that are just in similar directions |
| 297 | + relative_distance_threshold = 0.01 * avg_norm if avg_norm > 0 else 0.1 |
| 298 | + |
| 299 | + if euclidean_distance <= relative_distance_threshold: |
| 300 | + merged_pairs.append((doc_id1, doc_id2, cluster1, cluster2, similarity)) |
| 301 | + logger.info(f"Found duplicate documents: {doc_id1} and {doc_id2} (similarity: {similarity:.4f}, distance: {euclidean_distance:.4f}) in different clusters {cluster1} and {cluster2}") |
| 302 | + |
| 303 | + # Merge duplicate documents into the same cluster |
| 304 | + if merged_pairs: |
| 305 | + logger.info(f"Merging {len(merged_pairs)} pairs of duplicate documents") |
| 306 | + |
| 307 | + # Build a graph of duplicate relationships using union-find |
| 308 | + parent = {} |
| 309 | + |
| 310 | + def find(x): |
| 311 | + if x not in parent: |
| 312 | + parent[x] = x |
| 313 | + if parent[x] != x: |
| 314 | + parent[x] = find(parent[x]) |
| 315 | + return parent[x] |
| 316 | + |
| 317 | + def union(x, y): |
| 318 | + px, py = find(x), find(y) |
| 319 | + if px != py: |
| 320 | + parent[px] = py |
| 321 | + |
| 322 | + # Build union-find structure |
| 323 | + for doc_id1, doc_id2, _, _, _ in merged_pairs: |
| 324 | + union(doc_id1, doc_id2) |
| 325 | + |
| 326 | + # Group documents by their root parent |
| 327 | + # Only include documents that are part of duplicate pairs |
| 328 | + duplicate_doc_ids = set() |
| 329 | + for doc_id1, doc_id2, _, _, _ in merged_pairs: |
| 330 | + duplicate_doc_ids.add(doc_id1) |
| 331 | + duplicate_doc_ids.add(doc_id2) |
| 332 | + |
| 333 | + groups = {} |
| 334 | + for doc_id in duplicate_doc_ids: |
| 335 | + root = find(doc_id) |
| 336 | + if root not in groups: |
| 337 | + groups[root] = [] |
| 338 | + groups[root].append(doc_id) |
| 339 | + |
| 340 | + # Merge each group into the same cluster |
| 341 | + for root, doc_group in groups.items(): |
| 342 | + if len(doc_group) < 2: |
| 343 | + continue |
| 344 | + |
| 345 | + # Find all clusters containing documents in this group |
| 346 | + clusters_in_group = set() |
| 347 | + for doc_id in doc_group: |
| 348 | + if doc_id in doc_to_cluster: |
| 349 | + clusters_in_group.add(doc_to_cluster[doc_id]) |
| 350 | + |
| 351 | + if len(clusters_in_group) > 1: |
| 352 | + # Merge all documents to the smallest cluster ID |
| 353 | + target_cluster = min(clusters_in_group) |
| 354 | + |
| 355 | + for doc_id in doc_group: |
| 356 | + current_cluster = doc_to_cluster.get(doc_id) |
| 357 | + if current_cluster is not None and current_cluster != target_cluster: |
| 358 | + # Move document to target cluster |
| 359 | + if current_cluster in clusters and doc_id in clusters[current_cluster]: |
| 360 | + clusters[current_cluster].remove(doc_id) |
| 361 | + if target_cluster not in clusters: |
| 362 | + clusters[target_cluster] = [] |
| 363 | + if doc_id not in clusters[target_cluster]: |
| 364 | + clusters[target_cluster].append(doc_id) |
| 365 | + doc_to_cluster[doc_id] = target_cluster |
| 366 | + logger.debug(f"Moved {doc_id} from cluster {current_cluster} to cluster {target_cluster}") |
| 367 | + |
| 368 | + # Remove empty clusters |
| 369 | + empty_clusters = [cid for cid, docs in clusters.items() if not docs] |
| 370 | + for cid in empty_clusters: |
| 371 | + del clusters[cid] |
| 372 | + logger.debug(f"Removed empty cluster {cid}") |
| 373 | + |
| 374 | + logger.info(f"Successfully merged duplicate documents. Final cluster count: {len(clusters)}") |
| 375 | + |
| 376 | + return clusters |
| 377 | + |
| 378 | + except Exception as e: |
| 379 | + logger.error(f"Error merging duplicate documents: {str(e)}", exc_info=True) |
| 380 | + # Return original clusters if merge fails |
| 381 | + return clusters |
| 382 | + |
| 383 | + |
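The duplicate test above requires both near-identical direction (cosine similarity at or above the threshold) and near-identical magnitude (Euclidean distance within roughly 1% of the average vector norm). A self-contained sketch of just that test with toy vectors; `looks_like_duplicate` is a hypothetical helper, not part of this diff:

```python
# Sketch of the two-part duplicate check: same direction AND same scale.
# Thresholds mirror the defaults used above; vectors are toy values.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def looks_like_duplicate(e1: np.ndarray, e2: np.ndarray,
                         sim_threshold: float = 0.98,
                         rel_dist_ratio: float = 0.01) -> bool:
    sim = cosine_similarity(e1.reshape(1, -1), e2.reshape(1, -1))[0][0]
    if sim < sim_threshold:
        return False
    dist = np.linalg.norm(e1 - e2)
    avg_norm = (np.linalg.norm(e1) + np.linalg.norm(e2)) / 2.0
    return dist <= (rel_dist_ratio * avg_norm if avg_norm > 0 else 0.1)

a = np.array([1.0, 2.0, 3.0])
print(looks_like_duplicate(a, a * 1.001))  # True: same direction, same scale
print(looks_like_duplicate(a, a * 3.0))    # False: same direction, different scale
```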
220 | 384 | def kmeans_cluster_documents(doc_embeddings: Dict[str, np.ndarray], k: Optional[int] = None) -> Dict[int, List[str]]: |
221 | 385 | """ |
222 | 386 | Cluster documents using K-means |
@@ -266,6 +430,13 @@ def kmeans_cluster_documents(doc_embeddings: Dict[str, np.ndarray], k: Optional[ |
266 | 430 | for cluster_id, docs in clusters.items(): |
267 | 431 | logger.info(f"Cluster {cluster_id}: {len(docs)} documents") |
268 | 432 |
| 433 | + # Post-process: merge duplicate documents that were split into different clusters |
| 434 | + clusters = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98) |
| 435 | + |
| 436 | + # Log final cluster sizes after merge |
| 437 | + for cluster_id, docs in clusters.items(): |
| 438 | + logger.info(f"Final cluster {cluster_id}: {len(docs)} documents") |
| 439 | + |
269 | 440 | return clusters |
270 | 441 |
271 | 442 | except Exception as e: |
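A hedged usage sketch of the new post-processing step, assuming `merge_duplicate_documents_in_clusters` from this diff is in scope and the module's logger is configured; file names, vectors, and cluster assignments are toy data:

```python
# Two near-identical documents placed in different clusters collapse into one
# cluster after post-processing. Expected result: {0: ["a.pdf", "b.pdf", "a_copy.pdf"]}.
import numpy as np

doc_embeddings = {
    "a.pdf":      np.array([1.0, 0.0, 0.0]),
    "a_copy.pdf": np.array([1.0, 0.0, 0.0001]),  # same content, different path_or_url
    "b.pdf":      np.array([0.0, 1.0, 0.0]),
}
clusters = {0: ["a.pdf", "b.pdf"], 1: ["a_copy.pdf"]}

merged = merge_duplicate_documents_in_clusters(
    clusters, doc_embeddings, similarity_threshold=0.98
)
print(merged)  # "a_copy.pdf" should join cluster 0; empty cluster 1 is removed
```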