Skip to content

Commit f5ea95e

Browse files
committed
test: Add comprehensive coverage tests for document_vector_utils uncovered code paths
1 parent f5bf531 commit f5ea95e

File tree

1 file changed

+342
-1
lines changed

1 file changed

+342
-1
lines changed

test/backend/test_document_vector_utils_coverage.py

Lines changed: 342 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@
2626
summarize_cluster_legacy,
2727
summarize_clusters_map_reduce,
2828
summarize_clusters,
29-
merge_cluster_summaries
29+
merge_cluster_summaries,
30+
calculate_document_embedding,
31+
auto_determine_k,
32+
kmeans_cluster_documents
3033
)
3134

3235

@@ -308,3 +311,341 @@ def test_merge_cluster_summaries_single(self):
308311
assert isinstance(result, str)
309312
assert "Single cluster summary" in result
310313

314+
315+
class TestAdditionalCoverage:
    """Coverage tests for previously-uncovered code paths in document_vector_utils.

    Each test targets a specific defensive branch (non-list aggregation
    buckets, empty inputs, exception fallbacks) by mocking Elasticsearch,
    numpy/sklearn internals, or sibling helpers in the module under test.
    """

    def test_get_documents_from_es_non_list_documents(self):
        """Test ES retrieval when all_documents is not a list"""
        mock_es_core = MagicMock()

        # Mock the first search call to return a tuple instead of list
        mock_es_core.client.search.side_effect = [
            {
                'aggregations': {
                    'unique_documents': {
                        'buckets': (  # This will trigger the isinstance check
                            {'key': '/path/doc1.pdf', 'doc_count': 3},
                        )
                    }
                }
            },
            {
                'hits': {
                    'hits': [
                        {
                            '_source': {
                                'filename': 'doc1.pdf',
                                'content': 'test content',
                                'embedding': [0.1, 0.2, 0.3],
                                'file_size': 1000
                            }
                        }
                    ]
                }
            }
        ]

        result = get_documents_from_es('test_index', mock_es_core)
        assert isinstance(result, dict)

    def test_get_documents_from_es_no_chunks(self):
        """Test ES retrieval when document has no chunks"""
        mock_es_core = MagicMock()
        mock_es_core.client.search.side_effect = [
            {
                'aggregations': {
                    'unique_documents': {
                        'buckets': [
                            {'key': '/path/doc1.pdf', 'doc_count': 0}
                        ]
                    }
                }
            },
            {
                'hits': {
                    'hits': []  # No chunks
                }
            }
        ]

        result = get_documents_from_es('test_index', mock_es_core)
        assert result == {}  # Should return empty dict when no chunks

    def test_calculate_document_embedding_exception(self):
        """Test calculate_document_embedding with exception"""
        chunks = [
            {'content': 'test content', 'embedding': [0.1, 0.2, 0.3]}
        ]

        # Mock numpy operations to raise exception
        with patch('numpy.array') as mock_array:
            mock_array.side_effect = Exception("Numpy error")

            result = calculate_document_embedding(chunks)
            assert result is None

    def test_auto_determine_k_small_dataset(self):
        """Test auto_determine_k with very small dataset"""
        # Create embeddings with only 2 samples (less than min_k=3)
        embeddings = np.array([[0.1, 0.2], [0.3, 0.4]])

        result = auto_determine_k(embeddings, min_k=3, max_k=5)
        assert result == 2  # Should return max(2, n_samples)

    def test_auto_determine_k_exception(self):
        """Test auto_determine_k with exception during calculation"""
        embeddings = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])

        # Mock silhouette_score to raise exception
        with patch('sklearn.metrics.silhouette_score') as mock_silhouette:
            mock_silhouette.side_effect = Exception("Silhouette error")

            result = auto_determine_k(embeddings, min_k=2, max_k=3)
            # Should use heuristic fallback
            assert isinstance(result, int)
            assert result >= 2

    def test_kmeans_cluster_documents_empty(self):
        """Test kmeans_cluster_documents with empty embeddings"""
        result = kmeans_cluster_documents({})
        assert result == {}

    def test_kmeans_cluster_documents_exception(self):
        """Test kmeans_cluster_documents with exception"""
        doc_embeddings = {
            'doc1': np.array([0.1, 0.2, 0.3]),
            'doc2': np.array([0.4, 0.5, 0.6])
        }

        # Mock auto_determine_k to raise exception
        with patch('backend.utils.document_vector_utils.auto_determine_k') as mock_auto_k:
            mock_auto_k.side_effect = Exception("Auto K error")

            with pytest.raises(Exception, match="Failed to cluster documents"):
                kmeans_cluster_documents(doc_embeddings)

    def test_process_documents_for_clustering_exception(self):
        """Test process_documents_for_clustering with exception"""
        mock_es_core = MagicMock()
        mock_es_core.client.search.side_effect = Exception("ES error")

        with pytest.raises(Exception, match="Failed to process documents"):
            process_documents_for_clustering('test_index', mock_es_core)

    def test_process_documents_for_clustering_no_embeddings(self):
        """Test process_documents_for_clustering when some documents fail embedding calculation"""
        mock_es_core = MagicMock()
        mock_es_core.client.search.return_value = {
            'aggregations': {
                'unique_documents': {
                    'buckets': [
                        {'key': '/path/doc1.pdf', 'doc_count': 1}
                    ]
                }
            },
            'hits': {
                'hits': [
                    {
                        '_source': {
                            'filename': 'doc1.pdf',
                            'content': 'test content',
                            'embedding': [0.1, 0.2, 0.3],
                            'file_size': 1000
                        }
                    }
                ]
            }
        }

        # Mock calculate_document_embedding to return None
        with patch('backend.utils.document_vector_utils.calculate_document_embedding') as mock_calc:
            mock_calc.return_value = None

            docs, embeddings = process_documents_for_clustering('test_index', mock_es_core)
            assert isinstance(docs, dict)
            assert isinstance(embeddings, dict)
            assert len(embeddings) == 0  # No successful embeddings

    def test_extract_cluster_content_missing_doc(self):
        """Test extract_cluster_content with missing document"""
        document_samples = {
            'doc1': {
                'chunks': [{'content': 'test content'}]
            }
        }
        cluster_doc_ids = ['doc1', 'missing_doc']

        result = extract_cluster_content(document_samples, cluster_doc_ids)
        assert isinstance(result, str)
        assert 'test content' in result

    def test_extract_cluster_content_no_chunks(self):
        """Test extract_cluster_content with document having no chunks"""
        document_samples = {
            'doc1': {
                'chunks': []
            }
        }
        cluster_doc_ids = ['doc1']

        result = extract_cluster_content(document_samples, cluster_doc_ids)
        assert isinstance(result, str)

    def test_extract_representative_chunks_smart_import_error(self):
        """Test extract_representative_chunks_smart with ImportError"""
        chunks = [
            {'content': 'chunk 1'},
            {'content': 'chunk 2'},
            {'content': 'chunk 3'}
        ]

        # Mock the import to raise ImportError
        with patch('builtins.__import__', side_effect=ImportError("Module not found")):
            result = extract_representative_chunks_smart(chunks, max_chunks=2)
            assert len(result) <= 2
            assert len(result) > 0

    def test_extract_representative_chunks_smart_short_content(self):
        """Test extract_representative_chunks_smart with short content"""
        chunks = [
            {'content': 'short'},
            {'content': 'also short'},
            {'content': 'very short content'}
        ]

        result = extract_representative_chunks_smart(chunks, max_chunks=2)
        assert len(result) <= 2
        assert len(result) > 0

    def test_analyze_cluster_coherence_empty(self):
        """Test analyze_cluster_coherence with empty cluster_doc_ids"""
        document_samples = {
            'doc1': {
                'chunks': [{'content': 'test content'}]
            }
        }
        cluster_doc_ids = []

        result = analyze_cluster_coherence(cluster_doc_ids, document_samples)
        assert result == {}

    def test_analyze_cluster_coherence_missing_doc(self):
        """Test analyze_cluster_coherence with missing document"""
        document_samples = {
            'doc1': {
                'chunks': [{'content': 'test content'}]
            }
        }
        cluster_doc_ids = ['doc1', 'missing_doc']

        result = analyze_cluster_coherence(cluster_doc_ids, document_samples)
        assert isinstance(result, dict)

    def test_analyze_cluster_coherence_no_chunks(self):
        """Test analyze_cluster_coherence with document having no chunks"""
        document_samples = {
            'doc1': {
                'chunks': []
            }
        }
        cluster_doc_ids = ['doc1']

        result = analyze_cluster_coherence(cluster_doc_ids, document_samples)
        assert isinstance(result, dict)

    def test_summarize_clusters_map_reduce_missing_doc(self):
        """Test summarize_clusters_map_reduce with missing document"""
        document_samples = {
            'doc1': {
                'chunks': [{'content': 'test content'}],
                'filename': 'test.pdf'
            }
        }
        clusters = {0: ['doc1', 'missing_doc']}

        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
            mock_sum_doc.return_value = "Doc summary"

            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
                mock_sum_cluster.return_value = "Cluster summary"

                result = summarize_clusters_map_reduce(document_samples, clusters)
                assert isinstance(result, dict)
                assert 0 in result

    def test_summarize_clusters_map_reduce_few_chunks(self):
        """Test summarize_clusters_map_reduce with document having few chunks"""
        document_samples = {
            'doc1': {
                'chunks': [
                    {'content': 'chunk 1'},
                    {'content': 'chunk 2'}
                ],
                'filename': 'test.pdf'
            }
        }
        clusters = {0: ['doc1']}

        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
            mock_sum_doc.return_value = "Doc summary"

            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
                mock_sum_cluster.return_value = "Cluster summary"

                result = summarize_clusters_map_reduce(document_samples, clusters)
                assert isinstance(result, dict)
                assert 0 in result

    def test_summarize_clusters_map_reduce_long_content(self):
        """Test summarize_clusters_map_reduce with long content"""
        long_content = 'x' * 1500  # Longer than 1000 chars
        document_samples = {
            'doc1': {
                'chunks': [
                    {'content': long_content}
                ],
                'filename': 'test.pdf'
            }
        }
        clusters = {0: ['doc1']}

        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
            mock_sum_doc.return_value = "Doc summary"

            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
                mock_sum_cluster.return_value = "Cluster summary"

                result = summarize_clusters_map_reduce(document_samples, clusters)
                assert isinstance(result, dict)
                assert 0 in result

    def test_summarize_clusters_map_reduce_no_valid_docs(self):
        """Test summarize_clusters_map_reduce with no valid document summaries"""
        document_samples = {
            'doc1': {
                'chunks': [{'content': 'test content'}],
                'filename': 'test.pdf'
            }
        }
        clusters = {0: ['doc1']}

        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
            mock_sum_doc.return_value = ""  # Empty summary

            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
                mock_sum_cluster.return_value = "Cluster summary"

                result = summarize_clusters_map_reduce(document_samples, clusters)
                assert isinstance(result, dict)
                assert 0 in result

    def test_summarize_cluster_legacy_exception(self):
        """Test summarize_cluster_legacy with exception"""
        cluster_content = "Test cluster content"

        # Mock file operations to raise exception
        with patch('builtins.open', side_effect=Exception("File error")):
            result = summarize_cluster_legacy(cluster_content)
            assert "Failed to generate summary" in result

0 commit comments

Comments
 (0)