
Commit 4b528a9

Merge branch 'develop' into patch-1
2 parents: 0084dab + 8ad3290, commit 4b528a9

File tree

11 files changed: +496 −143 lines changed


backend/utils/document_vector_utils.py

Lines changed: 175 additions & 4 deletions
@@ -17,6 +17,7 @@
 from jinja2 import Template, StrictUndefined
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
+from sklearn.metrics.pairwise import cosine_similarity
 
 from consts.const import LANGUAGE

@@ -77,7 +78,15 @@ def get_documents_from_es(index_name: str, es_core, sample_doc_count: int = 200)
         "query": {
             "term": {"path_or_url": path_or_url}
         },
-        "size": chunk_count  # Get all chunks
+        "size": chunk_count,  # Get all chunks
+        "sort": [
+            {
+                "create_time": {
+                    "order": "asc",
+                    "missing": "_last"  # Put documents without create_time at the end
+                }
+            }
+        ]
     }
 
     chunks_response = es_core.client.search(index=index_name, body=chunks_query)
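
For context, a self-contained sketch of the same chunk query against a plain Elasticsearch client; the host, index name, example path, and size value are placeholders, while the field names path_or_url and create_time come from the diff above.

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")  # placeholder connection

    chunks_query = {
        "query": {"term": {"path_or_url": "docs/example.pdf"}},  # placeholder document path
        "size": 500,  # assumed large enough to return every chunk of the document
        "sort": [
            {"create_time": {"order": "asc", "missing": "_last"}}  # undated chunks sort last
        ],
    }
    resp = es.search(index="knowledge_base_index", body=chunks_query)  # placeholder index name
    chunks = [hit["_source"] for hit in resp["hits"]["hits"]]

With the ascending sort on create_time, chunks come back in a stable chronological order instead of whatever order the shards return, which keeps the downstream document embedding reproducible.
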
@@ -124,10 +133,9 @@ def calculate_document_embedding(doc_chunks: List[Dict], use_weighted: bool = Tr
         embeddings.append(np.array(chunk_embedding))
 
         if use_weighted:
-            # Weight by content length
+            # Weight by content length only (removed position-based weight to reduce order dependency)
             content_length = len(chunk.get('content', ''))
-            position_weight = 1.5 if len(embeddings) == 1 else 1.0  # First chunk has higher weight
-            weight = position_weight * content_length
+            weight = content_length
             weights.append(weight)
 
     if not embeddings:
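
For reference, a minimal standalone sketch of the weighting scheme after this change: each chunk embedding contributes to the document embedding with a weight equal to its content length, and there is no longer a bonus for the first chunk. The 'embedding' and 'content' keys mirror the chunk dictionaries used above; everything else is illustrative.

    import numpy as np

    def length_weighted_doc_embedding(doc_chunks):
        """Content-length-weighted mean of chunk embeddings (illustrative sketch)."""
        embeddings, weights = [], []
        for chunk in doc_chunks:
            emb = chunk.get('embedding')
            if emb is None:
                continue
            embeddings.append(np.array(emb))
            weights.append(len(chunk.get('content', '')))  # weight = content length only
        if not embeddings:
            return None
        if sum(weights) == 0:
            weights = None  # fall back to a plain mean when all chunks are empty
        return np.average(np.stack(embeddings), axis=0, weights=weights)

Because the weights no longer depend on which chunk happens to arrive first, the result is the same regardless of chunk ordering, which is what the sort added above also works toward.
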
@@ -217,6 +225,162 @@ def auto_determine_k(embeddings: np.ndarray, min_k: int = 3, max_k: int = 15) ->
     return heuristic_k
 
 
+def merge_duplicate_documents_in_clusters(clusters: Dict[int, List[str]], doc_embeddings: Dict[str, np.ndarray], similarity_threshold: float = 0.98) -> Dict[int, List[str]]:
+    """
+    Post-process clusters to merge duplicate documents (same content but different path_or_url)
+    that were incorrectly split into different clusters.
+
+    Args:
+        clusters: Dictionary mapping cluster IDs to lists of document IDs
+        doc_embeddings: Dictionary mapping document IDs to their embeddings
+        similarity_threshold: Cosine similarity threshold to consider documents as duplicates (default: 0.98)
+
+    Returns:
+        Updated clusters dictionary with duplicate documents merged
+    """
+    try:
+        if not clusters or not doc_embeddings:
+            return clusters
+
+        # Skip merging if there's only one cluster (nothing to merge)
+        if len(clusters) <= 1:
+            return clusters
+
+        # Build a mapping from doc_id to its current cluster
+        doc_to_cluster = {}
+        for cluster_id, doc_ids in clusters.items():
+            for doc_id in doc_ids:
+                doc_to_cluster[doc_id] = cluster_id
+
+        # Find duplicate pairs with high similarity
+        doc_ids_list = list(doc_embeddings.keys())
+        merged_pairs = []
+
+        for i, doc_id1 in enumerate(doc_ids_list):
+            if doc_id1 not in doc_embeddings:
+                continue
+
+            embedding1 = doc_embeddings[doc_id1]
+
+            for j, doc_id2 in enumerate(doc_ids_list[i+1:], start=i+1):
+                if doc_id2 not in doc_embeddings:
+                    continue
+
+                embedding2 = doc_embeddings[doc_id2]
+
+                # Calculate cosine similarity
+                similarity = cosine_similarity(
+                    embedding1.reshape(1, -1),
+                    embedding2.reshape(1, -1)
+                )[0][0]
+
+                # If similarity is very high, they are likely duplicates
+                if similarity >= similarity_threshold:
+                    cluster1 = doc_to_cluster.get(doc_id1)
+                    cluster2 = doc_to_cluster.get(doc_id2)
+
+                    # Only merge if they are in different clusters AND truly duplicates
+                    # Check both cosine similarity AND Euclidean distance to prevent false positives
+                    if cluster1 is not None and cluster2 is not None and cluster1 != cluster2:
+                        # Calculate Euclidean distance to ensure they're truly duplicates
+                        # Documents that are just in the same direction but far apart should not be merged
+                        euclidean_distance = np.linalg.norm(embedding1 - embedding2)
+
+                        # Normalize embeddings to get their magnitudes
+                        norm1 = np.linalg.norm(embedding1)
+                        norm2 = np.linalg.norm(embedding2)
+                        avg_norm = (norm1 + norm2) / 2.0
+
+                        # Relative distance threshold: if distance is less than 1% of average magnitude,
+                        # they are likely true duplicates (same content, different path_or_url)
+                        # This prevents merging documents that are just in similar directions
+                        relative_distance_threshold = 0.01 * avg_norm if avg_norm > 0 else 0.1
+
+                        if euclidean_distance <= relative_distance_threshold:
+                            merged_pairs.append((doc_id1, doc_id2, cluster1, cluster2, similarity))
+                            logger.info(f"Found duplicate documents: {doc_id1} and {doc_id2} (similarity: {similarity:.4f}, distance: {euclidean_distance:.4f}) in different clusters {cluster1} and {cluster2}")
+
+        # Merge duplicate documents into the same cluster
+        if merged_pairs:
+            logger.info(f"Merging {len(merged_pairs)} pairs of duplicate documents")
+
+            # Build a graph of duplicate relationships using union-find
+            parent = {}
+
+            def find(x):
+                if x not in parent:
+                    parent[x] = x
+                if parent[x] != x:
+                    parent[x] = find(parent[x])
+                return parent[x]
+
+            def union(x, y):
+                px, py = find(x), find(y)
+                if px != py:
+                    parent[px] = py
+
+            # Build union-find structure
+            for doc_id1, doc_id2, _, _, _ in merged_pairs:
+                union(doc_id1, doc_id2)
+
+            # Group documents by their root parent
+            # Only include documents that are part of duplicate pairs
+            duplicate_doc_ids = set()
+            for doc_id1, doc_id2, _, _, _ in merged_pairs:
+                duplicate_doc_ids.add(doc_id1)
+                duplicate_doc_ids.add(doc_id2)
+
+            groups = {}
+            for doc_id in duplicate_doc_ids:
+                root = find(doc_id)
+                if root not in groups:
+                    groups[root] = []
+                groups[root].append(doc_id)
+
+            # Merge each group into the same cluster
+            for root, doc_group in groups.items():
+                if len(doc_group) < 2:
+                    continue
+
+                # Find all clusters containing documents in this group
+                clusters_in_group = set()
+                for doc_id in doc_group:
+                    if doc_id in doc_to_cluster:
+                        clusters_in_group.add(doc_to_cluster[doc_id])
+
+                if len(clusters_in_group) > 1:
+                    # Merge all documents to the smallest cluster ID
+                    target_cluster = min(clusters_in_group)
+
+                    for doc_id in doc_group:
+                        current_cluster = doc_to_cluster.get(doc_id)
+                        if current_cluster is not None and current_cluster != target_cluster:
+                            # Move document to target cluster
+                            if current_cluster in clusters and doc_id in clusters[current_cluster]:
+                                clusters[current_cluster].remove(doc_id)
+                            if target_cluster not in clusters:
+                                clusters[target_cluster] = []
+                            if doc_id not in clusters[target_cluster]:
+                                clusters[target_cluster].append(doc_id)
+                            doc_to_cluster[doc_id] = target_cluster
+                            logger.debug(f"Moved {doc_id} from cluster {current_cluster} to cluster {target_cluster}")
+
+            # Remove empty clusters
+            empty_clusters = [cid for cid, docs in clusters.items() if not docs]
+            for cid in empty_clusters:
+                del clusters[cid]
+                logger.debug(f"Removed empty cluster {cid}")
+
+            logger.info(f"Successfully merged duplicate documents. Final cluster count: {len(clusters)}")
+
+        return clusters
+
+    except Exception as e:
+        logger.error(f"Error merging duplicate documents: {str(e)}", exc_info=True)
+        # Return original clusters if merge fails
+        return clusters
+
+
 def kmeans_cluster_documents(doc_embeddings: Dict[str, np.ndarray], k: Optional[int] = None) -> Dict[int, List[str]]:
     """
     Cluster documents using K-means
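
As a quick sanity check of the behaviour described in the docstring, a hypothetical toy run (assuming the function and its module-level logger are importable; the import path below is inferred from the file location in this commit): two near-identical vectors that landed in different clusters should be pulled together, while a clearly different document stays put.

    import numpy as np
    from backend.utils.document_vector_utils import merge_duplicate_documents_in_clusters  # path assumed

    doc_embeddings = {
        "doc_a": np.array([1.0, 0.0, 0.0]),
        "doc_b": np.array([1.0, 1e-4, 0.0]),  # same content under a different path_or_url
        "doc_c": np.array([0.0, 1.0, 0.0]),   # genuinely different document
    }
    clusters = {0: ["doc_a"], 1: ["doc_b", "doc_c"]}

    merged = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98)
    print(merged)  # expected: {0: ['doc_a', 'doc_b'], 1: ['doc_c']}

doc_a and doc_b pass both tests (cosine similarity above 0.98 and Euclidean distance well under 1% of the average vector norm), so doc_b is moved into the lower-numbered cluster; doc_c fails the similarity test against both and is left alone.
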
@@ -266,6 +430,13 @@ def kmeans_cluster_documents(doc_embeddings: Dict[str, np.ndarray], k: Optional[
         for cluster_id, docs in clusters.items():
             logger.info(f"Cluster {cluster_id}: {len(docs)} documents")
 
+        # Post-process: merge duplicate documents that were split into different clusters
+        clusters = merge_duplicate_documents_in_clusters(clusters, doc_embeddings, similarity_threshold=0.98)
+
+        # Log final cluster sizes after merge
+        for cluster_id, docs in clusters.items():
+            logger.info(f"Final cluster {cluster_id}: {len(docs)} documents")
+
         return clusters
 
     except Exception as e:

doc/docs/zh/opensource-memorial-wall.md

Lines changed: 9 additions & 3 deletions
@@ -204,7 +204,6 @@ Nexent is so powerful and has helped me a lot. Thanks to the developers! Amazing
 
 ::: Pharaoh-C - 2025-11-2
 I do research on multi-agent collaboration, and Nexent's multi-agent coordination feature really caught my eye; this matters a lot in academic research. I used Nexent to build an AI cyber-doctor that automatically indexes Chinese and Western medical literature and answers questions about symptoms. The power of open source is truly impressive, and I hope more researchers will join in!
-
 :::
 
 ::: info xUxIAOrUI -2025-10-28
@@ -214,6 +213,13 @@ Nexent is so powerful and has helped me a lot. Thanks to the developers! Amazing
 ::: info niceman - 2025-10-29
 I hope to take part in the ICT competition and learn more. Thanks to Nexent for starting me on my open-source journey~
 :::
-:::
-I hope to learn more at the ICT competition. Thanks to the Nexent platform
+
+::: info niceman - 2025-10-29
+Thanks to Nexent for starting me on my open-source journey! I hope to join the ICT competition to broaden my horizons. Nice project, here's a star~
+
+:::info XxHosxX - 2025-11-5
+I hope to improve my skills through the ICT competition and the Nexent platform :)
+
+::: tip Zwer1 - 2025-11-04
+Thanks to Nexent for starting me on my open-source journey! The platform's agent-building capability is very powerful, and I hope to learn a lot more!
 :::
