
Commit 438df82

git commit -m "feat: integrate knowledge base summarization service and optimize stability"
1 parent 1d2446c · commit 438df82

3 files changed (+47, -39 lines)


backend/data_process/tasks.py

6 additions, 0 deletions

@@ -391,6 +391,12 @@ def forward(
                 cached, str) else str(type(cached))
             logger.error(
                 f"[{self.request.id}] FORWARD TASK: JSON decode error for key '{redis_key}': {str(jde)}; raw_prefix={raw_preview!r}")
+            # Try to clean up corrupted Redis data
+            try:
+                client.delete(redis_key)
+                logger.info(f"[{self.request.id}] FORWARD TASK: Deleted corrupted Redis key '{redis_key}'")
+            except Exception as cleanup_e:
+                logger.warning(f"[{self.request.id}] FORWARD TASK: Failed to cleanup corrupted Redis key: {cleanup_e}")
             raise
         else:
             # No busy-wait: release the worker slot and retry later
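The new error path is self-healing: when a cached payload fails JSON decoding, the task now deletes the corrupted key (best-effort, in its own try/except) before re-raising, so later retries start clean instead of tripping over the same bad value. A minimal sketch of the same pattern, assuming a redis-py style client; the function name and signature here are illustrative, not the repo's actual API:

```python
import json
import logging

logger = logging.getLogger(__name__)


def load_cached_payload(client, redis_key: str) -> dict:
    """Sketch of the delete-on-corruption pattern from the diff above.

    `client` is assumed to be a redis-py style client exposing get/delete;
    the helper name is hypothetical.
    """
    cached = client.get(redis_key)
    if cached is None:
        raise KeyError(f"no cached payload under '{redis_key}'")
    try:
        return json.loads(cached)
    except json.JSONDecodeError:
        # Best-effort cleanup: a corrupted value would otherwise fail again
        # on every retry, so drop the key before re-raising.
        try:
            client.delete(redis_key)
            logger.info("Deleted corrupted Redis key '%s'", redis_key)
        except Exception as cleanup_e:
            logger.warning("Failed to clean up corrupted Redis key: %s", cleanup_e)
        raise
```

Only the cleanup failure is swallowed (logged at warning level); the JSONDecodeError remains the surfaced exception, which preserves the task's existing failure semantics.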

sdk/nexent/vector_database/elasticsearch_core.py

5 additions, 3 deletions

@@ -70,11 +70,13 @@ def __init__(
         self._settings_lock = threading.Lock()
         self._operation_counter = 0
 
-        # Embedding API limits
-        self.max_texts_per_batch = 2048
+        # Embedding API limits - Conservative settings for stability
+        # Different APIs have different limits, so we use a conservative approach
+        self.max_texts_per_batch = 50  # Very conservative for stability
         self.max_tokens_per_text = 8192
         self.max_total_tokens = 100000
-
+        self.max_retries = 3  # Number of retries for failed embedding batches
+
     # ---- INDEX MANAGEMENT ----
 
     def create_vector_index(self, index_name: str, embedding_dim: Optional[int] = None) -> bool:
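The batch cap drops from 2048 texts to 50 and a retry budget appears alongside the existing token limits. The commit changes only the settings, not the code that consumes them, so the following is a hedged sketch of how limits like these are typically used; `embed_in_batches` and `embed_fn` are hypothetical names, not part of the SDK:

```python
from typing import Callable, List, Optional


def embed_in_batches(
    texts: List[str],
    embed_fn: Callable[[List[str]], List[List[float]]],
    max_texts_per_batch: int = 50,  # mirrors the new conservative cap
    max_retries: int = 3,           # mirrors the new retry budget
) -> List[List[float]]:
    """Hypothetical consumer of the limits above; not the SDK's code."""
    vectors: List[List[float]] = []
    for start in range(0, len(texts), max_texts_per_batch):
        batch = texts[start:start + max_texts_per_batch]
        last_error: Optional[Exception] = None
        for _attempt in range(max_retries):
            try:
                vectors.extend(embed_fn(batch))
                break  # batch succeeded, move to the next slice
            except Exception as exc:  # e.g. rate limit or transient API error
                last_error = exc
        else:
            # for/else: we never hit `break`, so all retries failed
            raise RuntimeError(
                f"embedding batch failed after {max_retries} retries"
            ) from last_error
    return vectors
```

Shrinking the batch size trades throughput for stability: more round-trips, but each request stays well under the strictest provider's limit, and a failed batch is cheap to retry.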

test/backend/test_document_vector_utils.py

36 additions, 36 deletions (every changed line is a blank line, i.e. whitespace-only cleanup)

@@ -149,7 +149,7 @@ def test_kmeans_cluster_documents_empty(self):
 
 class TestExtractRepresentativeChunksSmart:
     """Test smart chunk selection"""
-
+
     def test_extract_representative_chunks_smart_basic(self):
         """Test basic smart chunk selection"""
         chunks = [
@@ -158,13 +158,13 @@ def test_extract_representative_chunks_smart_basic(self):
             {'content': 'Third chunk content'},
             {'content': 'Fourth chunk content'}
         ]
-
+
         result = extract_representative_chunks_smart(chunks, max_chunks=3)
-
+
         assert len(result) <= 3
         assert result[0] == chunks[0]  # First chunk always included
         assert result[-1] == chunks[-1]  # Last chunk included
-
+
     def test_extract_representative_chunks_smart_import_error(self):
         """Test fallback when calculate_term_weights import fails"""
         chunks = [
@@ -173,11 +173,11 @@ def test_extract_representative_chunks_smart_import_error(self):
             {'content': 'Third chunk content'},
             {'content': 'Fourth chunk content'}
         ]
-
+
         # Mock the import to fail
         with patch.dict('sys.modules', {'nexent.core.nlp.tokenizer': None}):
             result = extract_representative_chunks_smart(chunks, max_chunks=3)
-
+
             # The fallback logic actually returns 3 chunks (first, middle, last)
             assert len(result) == 3
             assert result[0] == chunks[0]  # First chunk
@@ -186,7 +186,7 @@ def test_extract_representative_chunks_smart_import_error(self):
 
 class TestSummarizeDocument:
     """Test document summarization"""
-
+
     def test_summarize_document_no_model(self):
         """Test document summarization without model"""
         result = summarize_document(
@@ -197,7 +197,7 @@ def test_summarize_document_no_model(self):
         )
         assert isinstance(result, str)
         assert "test.pdf" in result
-
+
     def test_summarize_document_with_model_placeholder(self):
         """Test document summarization with model ID but no actual LLM call"""
         result = summarize_document(
@@ -212,7 +212,7 @@ def test_summarize_document_with_model_placeholder(self):
 
 class TestSummarizeCluster:
     """Test cluster summarization"""
-
+
     def test_summarize_cluster_no_model(self):
         """Test cluster summarization without model"""
         result = summarize_cluster(
@@ -222,7 +222,7 @@ def test_summarize_cluster_no_model(self):
         )
         assert isinstance(result, str)
         assert "Summary" in result
-
+
     def test_summarize_cluster_with_model_placeholder(self):
         """Test cluster summarization with model ID but no actual LLM call"""
         result = summarize_cluster(
@@ -236,7 +236,7 @@ def test_summarize_cluster_with_model_placeholder(self):
 
 class TestSummarizeClustersMapReduce:
     """Test map-reduce cluster summarization"""
-
+
     def test_summarize_clusters_map_reduce_basic(self):
         """Test basic map-reduce summarization"""
         document_samples = {
@@ -252,24 +252,24 @@ def test_summarize_clusters_map_reduce_basic(self):
             }
         }
         clusters = {0: ['doc1', 'doc2']}
-
+
         with patch('backend.utils.document_vector_utils.summarize_document') as mock_summarize_doc, \
                 patch('backend.utils.document_vector_utils.summarize_cluster') as mock_summarize_cluster:
-
+
             mock_summarize_doc.return_value = "Document summary"
             mock_summarize_cluster.return_value = "Cluster summary"
-
+
             result = summarize_clusters_map_reduce(
                 document_samples=document_samples,
                 clusters=clusters,
                 model_id=1,
                 tenant_id="test_tenant"
             )
-
+
             assert isinstance(result, dict)
            assert 0 in result
             assert result[0] == "Cluster summary"
-
+
     def test_summarize_clusters_map_reduce_no_valid_documents(self):
         """Test map-reduce when no valid documents in cluster"""
         document_samples = {
@@ -279,38 +279,38 @@ def test_summarize_clusters_map_reduce_no_valid_documents(self):
             }
         }
         clusters = {0: ['doc1']}
-
+
         with patch('backend.utils.document_vector_utils.summarize_document') as mock_summarize_doc, \
                 patch('backend.utils.document_vector_utils.summarize_cluster') as mock_summarize_cluster:
-
+
            mock_summarize_doc.return_value = ""
             mock_summarize_cluster.return_value = "Mock cluster summary"
-
+
             result = summarize_clusters_map_reduce(
                 document_samples=document_samples,
                 clusters=clusters,
                 model_id=1,
                 tenant_id="test_tenant"
             )
-
+
             assert isinstance(result, dict)
             assert 0 in result
             assert result[0] == "Mock cluster summary"
 
 
 class TestMergeClusterSummaries:
     """Test cluster summary merging"""
-
+
     def test_merge_cluster_summaries(self):
         """Test merging multiple cluster summaries"""
         cluster_summaries = {
             0: "First cluster summary",
             1: "Second cluster summary",
             2: "Third cluster summary"
         }
-
+
         result = merge_cluster_summaries(cluster_summaries)
-
+
         assert isinstance(result, str)
         assert "First cluster summary" in result
         assert "Second cluster summary" in result
@@ -320,7 +320,7 @@ def test_merge_cluster_summaries(self):
 
 class TestGetDocumentsFromEs:
     """Test ES document retrieval"""
-
+
     def test_get_documents_from_es_mock(self):
         """Test ES document retrieval with mocked client"""
         mock_es_core = MagicMock()
@@ -348,9 +348,9 @@ def test_get_documents_from_es_mock(self):
                 }
             }
         }
-
+
         result = get_documents_from_es('test_index', mock_es_core, sample_doc_count=10)
-
+
         assert isinstance(result, dict)
         # The function returns a dict with document IDs as keys, not 'documents' key
         assert len(result) > 0
@@ -361,7 +361,7 @@ def test_get_documents_from_es_mock(self):
 
 class TestProcessDocumentsForClustering:
     """Test document processing for clustering"""
-
+
     def test_process_documents_for_clustering_mock(self):
         """Test document processing with mocked functions"""
         mock_es_core = MagicMock()
@@ -389,22 +389,22 @@ def test_process_documents_for_clustering_mock(self):
                 }
             }
         }
-
+
         with patch('backend.utils.document_vector_utils.calculate_document_embedding') as mock_calc_embedding:
             mock_calc_embedding.return_value = np.array([1.0, 2.0, 3.0])
-
+
             documents, embeddings = process_documents_for_clustering(
                 'test_index', mock_es_core, sample_doc_count=10
             )
-
+
             assert isinstance(documents, dict)
             assert isinstance(embeddings, dict)
             assert len(documents) == len(embeddings)
 
 
 class TestExtractClusterContent:
     """Test cluster content extraction"""
-
+
     def test_extract_cluster_content(self):
         """Test extracting content from cluster documents"""
         document_samples = {
@@ -418,9 +418,9 @@ def test_extract_cluster_content(self):
             }
         }
         doc_ids = ['doc1', 'doc2']
-
+
         result = extract_cluster_content(document_samples, doc_ids)
-
+
         assert isinstance(result, str)  # The function returns a formatted string
         assert 'Content 1' in result
         assert 'Content 2' in result
@@ -430,7 +430,7 @@ def test_extract_cluster_content(self):
 
 class TestAnalyzeClusterCoherence:
     """Test cluster coherence analysis"""
-
+
     def test_analyze_cluster_coherence(self):
         """Test cluster coherence analysis"""
         document_samples = {
@@ -444,9 +444,9 @@ def test_analyze_cluster_coherence(self):
             }
         }
        doc_ids = ['doc1', 'doc2']
-
+
         result = analyze_cluster_coherence(doc_ids, document_samples)
-
+
         assert isinstance(result, dict)
         assert 'doc_count' in result
         assert result['doc_count'] == 2
