
Commit 438df82

git commit -m "feat: integrate knowledge base summarization service and optimize stability"
1 parent 1d2446c · commit 438df82

3 files changed (+47, -39 lines)


backend/data_process/tasks.py

6 additions, 0 deletions

@@ -391,6 +391,12 @@ def forward(
                 cached, str) else str(type(cached))
             logger.error(
                 f"[{self.request.id}] FORWARD TASK: JSON decode error for key '{redis_key}': {str(jde)}; raw_prefix={raw_preview!r}")
+            # Try to clean up corrupted Redis data
+            try:
+                client.delete(redis_key)
+                logger.info(f"[{self.request.id}] FORWARD TASK: Deleted corrupted Redis key '{redis_key}'")
+            except Exception as cleanup_e:
+                logger.warning(f"[{self.request.id}] FORWARD TASK: Failed to cleanup corrupted Redis key: {cleanup_e}")
             raise
         else:
             # No busy-wait: release the worker slot and retry later
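The new error path is self-healing: when a cached payload fails JSON decoding, the task now deletes the corrupted key (best-effort, in its own try/except) before re-raising, so later retries start clean instead of tripping over the same bad value. A minimal sketch of the same pattern, assuming a redis-py style client; the function name and signature here are illustrative, not the repo's actual API:

```python
import json
import logging

logger = logging.getLogger(__name__)


def load_cached_payload(client, redis_key: str) -> dict:
    """Sketch of the delete-on-corruption pattern from the diff above.

    `client` is assumed to be a redis-py style client exposing get/delete;
    the helper name is hypothetical.
    """
    cached = client.get(redis_key)
    if cached is None:
        raise KeyError(f"no cached payload under '{redis_key}'")
    try:
        return json.loads(cached)
    except json.JSONDecodeError:
        # Best-effort cleanup: a corrupted value would otherwise fail again
        # on every retry, so drop the key before re-raising.
        try:
            client.delete(redis_key)
            logger.info("Deleted corrupted Redis key '%s'", redis_key)
        except Exception as cleanup_e:
            logger.warning("Failed to clean up corrupted Redis key: %s", cleanup_e)
        raise
```

Only the cleanup failure is swallowed (logged at warning level); the JSONDecodeError remains the surfaced exception, which preserves the task's existing failure semantics.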

sdk/nexent/vector_database/elasticsearch_core.py

5 additions, 3 deletions

@@ -70,11 +70,13 @@ def __init__(
         self._settings_lock = threading.Lock()
         self._operation_counter = 0
 
-        # Embedding API limits
-        self.max_texts_per_batch = 2048
+        # Embedding API limits - Conservative settings for stability
+        # Different APIs have different limits, so we use a conservative approach
+        self.max_texts_per_batch = 50  # Very conservative for stability
         self.max_tokens_per_text = 8192
         self.max_total_tokens = 100000
-
+        self.max_retries = 3  # Number of retries for failed embedding batches
+
     # ---- INDEX MANAGEMENT ----
 
     def create_vector_index(self, index_name: str, embedding_dim: Optional[int] = None) -> bool:
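The batch cap drops from 2048 texts to 50 and a retry budget appears alongside the existing token limits. The commit changes only the settings, not the code that consumes them, so the following is a hedged sketch of how limits like these are typically used; `embed_in_batches` and `embed_fn` are hypothetical names, not part of the SDK:

```python
from typing import Callable, List, Optional


def embed_in_batches(
    texts: List[str],
    embed_fn: Callable[[List[str]], List[List[float]]],
    max_texts_per_batch: int = 50,  # mirrors the new conservative cap
    max_retries: int = 3,           # mirrors the new retry budget
) -> List[List[float]]:
    """Hypothetical consumer of the limits above; not the SDK's code."""
    vectors: List[List[float]] = []
    for start in range(0, len(texts), max_texts_per_batch):
        batch = texts[start:start + max_texts_per_batch]
        last_error: Optional[Exception] = None
        for _attempt in range(max_retries):
            try:
                vectors.extend(embed_fn(batch))
                break  # batch succeeded, move to the next slice
            except Exception as exc:  # e.g. rate limit or transient API error
                last_error = exc
        else:
            # for/else: we never hit `break`, so all retries failed
            raise RuntimeError(
                f"embedding batch failed after {max_retries} retries"
            ) from last_error
    return vectors
```

Shrinking the batch size trades throughput for stability: more round-trips, but each request stays well under the strictest provider's limit, and a failed batch is cheap to retry.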

test/backend/test_document_vector_utils.py

36 additions, 36 deletions (every changed line is a blank line, i.e. whitespace-only cleanup)

@@ -149,7 +149,7 @@ def test_kmeans_cluster_documents_empty(self):
 
 class TestExtractRepresentativeChunksSmart:
     """Test smart chunk selection"""
-
+
     def test_extract_representative_chunks_smart_basic(self):
         """Test basic smart chunk selection"""
         chunks = [
@@ -158,13 +158,13 @@ def test_extract_representative_chunks_smart_basic(self):
             {'content': 'Third chunk content'},
             {'content': 'Fourth chunk content'}
         ]
-
+
         result = extract_representative_chunks_smart(chunks, max_chunks=3)
-
+
         assert len(result) <= 3
         assert result[0] == chunks[0]  # First chunk always included
         assert result[-1] == chunks[-1]  # Last chunk included
-
+
     def test_extract_representative_chunks_smart_import_error(self):
         """Test fallback when calculate_term_weights import fails"""
         chunks = [
@@ -173,11 +173,11 @@ def test_extract_representative_chunks_smart_import_error(self):
             {'content': 'Third chunk content'},
             {'content': 'Fourth chunk content'}
         ]
-
+
         # Mock the import to fail
         with patch.dict('sys.modules', {'nexent.core.nlp.tokenizer': None}):
             result = extract_representative_chunks_smart(chunks, max_chunks=3)
-
+
             # The fallback logic actually returns 3 chunks (first, middle, last)
             assert len(result) == 3
             assert result[0] == chunks[0]  # First chunk
@@ -186,7 +186,7 @@ def test_extract_representative_chunks_smart_import_error(self):
 
 class TestSummarizeDocument:
     """Test document summarization"""
-
+
     def test_summarize_document_no_model(self):
         """Test document summarization without model"""
         result = summarize_document(
@@ -197,7 +197,7 @@ def test_summarize_document_no_model(self):
         )
         assert isinstance(result, str)
         assert "test.pdf" in result
-
+
     def test_summarize_document_with_model_placeholder(self):
         """Test document summarization with model ID but no actual LLM call"""
         result = summarize_document(
@@ -212,7 +212,7 @@ def test_summarize_document_with_model_placeholder(self):
 
 class TestSummarizeCluster:
     """Test cluster summarization"""
-
+
     def test_summarize_cluster_no_model(self):
         """Test cluster summarization without model"""
         result = summarize_cluster(
@@ -222,7 +222,7 @@ def test_summarize_cluster_no_model(self):
         )
         assert isinstance(result, str)
         assert "Summary" in result
-
+
     def test_summarize_cluster_with_model_placeholder(self):
         """Test cluster summarization with model ID but no actual LLM call"""
         result = summarize_cluster(
@@ -236,7 +236,7 @@ def test_summarize_cluster_with_model_placeholder(self):
 
 class TestSummarizeClustersMapReduce:
     """Test map-reduce cluster summarization"""
-
+
     def test_summarize_clusters_map_reduce_basic(self):
         """Test basic map-reduce summarization"""
         document_samples = {
@@ -252,24 +252,24 @@ def test_summarize_clusters_map_reduce_basic(self):
             }
         }
         clusters = {0: ['doc1', 'doc2']}
-
+
         with patch('backend.utils.document_vector_utils.summarize_document') as mock_summarize_doc, \
                 patch('backend.utils.document_vector_utils.summarize_cluster') as mock_summarize_cluster:
-
+
             mock_summarize_doc.return_value = "Document summary"
             mock_summarize_cluster.return_value = "Cluster summary"
-
+
             result = summarize_clusters_map_reduce(
                 document_samples=document_samples,
                 clusters=clusters,
                 model_id=1,
                 tenant_id="test_tenant"
             )
-
+
             assert isinstance(result, dict)
            assert 0 in result
             assert result[0] == "Cluster summary"
-
+
     def test_summarize_clusters_map_reduce_no_valid_documents(self):
         """Test map-reduce when no valid documents in cluster"""
         document_samples = {
@@ -279,38 +279,38 @@ def test_summarize_clusters_map_reduce_no_valid_documents(self):
             }
         }
         clusters = {0: ['doc1']}
-
+
         with patch('backend.utils.document_vector_utils.summarize_document') as mock_summarize_doc, \
                 patch('backend.utils.document_vector_utils.summarize_cluster') as mock_summarize_cluster:
-
+
            mock_summarize_doc.return_value = ""
             mock_summarize_cluster.return_value = "Mock cluster summary"
-
+
             result = summarize_clusters_map_reduce(
                 document_samples=document_samples,
                 clusters=clusters,
                 model_id=1,
                 tenant_id="test_tenant"
             )
-
+
             assert isinstance(result, dict)
             assert 0 in result
             assert result[0] == "Mock cluster summary"
 
 
 class TestMergeClusterSummaries:
     """Test cluster summary merging"""
-
+
     def test_merge_cluster_summaries(self):
         """Test merging multiple cluster summaries"""
         cluster_summaries = {
             0: "First cluster summary",
             1: "Second cluster summary",
             2: "Third cluster summary"
         }
-
+
         result = merge_cluster_summaries(cluster_summaries)
-
+
         assert isinstance(result, str)
         assert "First cluster summary" in result
         assert "Second cluster summary" in result
@@ -320,7 +320,7 @@ def test_merge_cluster_summaries(self):
 
 class TestGetDocumentsFromEs:
     """Test ES document retrieval"""
-
+
     def test_get_documents_from_es_mock(self):
         """Test ES document retrieval with mocked client"""
         mock_es_core = MagicMock()
@@ -348,9 +348,9 @@ def test_get_documents_from_es_mock(self):
                 }
             }
         }
-
+
         result = get_documents_from_es('test_index', mock_es_core, sample_doc_count=10)
-
+
         assert isinstance(result, dict)
         # The function returns a dict with document IDs as keys, not 'documents' key
         assert len(result) > 0
@@ -361,7 +361,7 @@ def test_get_documents_from_es_mock(self):
 
 class TestProcessDocumentsForClustering:
     """Test document processing for clustering"""
-
+
     def test_process_documents_for_clustering_mock(self):
         """Test document processing with mocked functions"""
         mock_es_core = MagicMock()
@@ -389,22 +389,22 @@ def test_process_documents_for_clustering_mock(self):
                 }
             }
         }
-
+
         with patch('backend.utils.document_vector_utils.calculate_document_embedding') as mock_calc_embedding:
             mock_calc_embedding.return_value = np.array([1.0, 2.0, 3.0])
-
+
             documents, embeddings = process_documents_for_clustering(
                 'test_index', mock_es_core, sample_doc_count=10
             )
-
+
             assert isinstance(documents, dict)
             assert isinstance(embeddings, dict)
             assert len(documents) == len(embeddings)
 
 
 class TestExtractClusterContent:
     """Test cluster content extraction"""
-
+
     def test_extract_cluster_content(self):
         """Test extracting content from cluster documents"""
         document_samples = {
@@ -418,9 +418,9 @@ def test_extract_cluster_content(self):
             }
         }
         doc_ids = ['doc1', 'doc2']
-
+
         result = extract_cluster_content(document_samples, doc_ids)
-
+
         assert isinstance(result, str)  # The function returns a formatted string
         assert 'Content 1' in result
         assert 'Content 2' in result
@@ -430,7 +430,7 @@ def test_extract_cluster_content(self):
 
 class TestAnalyzeClusterCoherence:
     """Test cluster coherence analysis"""
-
+
     def test_analyze_cluster_coherence(self):
         """Test cluster coherence analysis"""
         document_samples = {
@@ -444,9 +444,9 @@ def test_analyze_cluster_coherence(self):
             }
         }
        doc_ids = ['doc1', 'doc2']
-
+
         result = analyze_cluster_coherence(doc_ids, document_samples)
-
+
         assert isinstance(result, dict)
         assert 'doc_count' in result
         assert result['doc_count'] == 2
