     summarize_cluster_legacy,
     summarize_clusters_map_reduce,
     summarize_clusters,
-    merge_cluster_summaries
+    merge_cluster_summaries,
+    calculate_document_embedding,
+    auto_determine_k,
+    kmeans_cluster_documents
 )
 
 
@@ -308,3 +311,341 @@ def test_merge_cluster_summaries_single(self):
         assert isinstance(result, str)
         assert "Single cluster summary" in result
 
+
+class TestAdditionalCoverage:
+    """Test additional coverage for uncovered code paths"""
+
+    def test_get_documents_from_es_non_list_documents(self):
+        """Test ES retrieval when all_documents is not a list"""
+        mock_es_core = MagicMock()
+
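+        # A list side_effect makes consecutive client.search calls return
+        # successive responses: the aggregation page first, then the chunk hits.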
+        # Mock the first search call to return a tuple instead of a list
+        mock_es_core.client.search.side_effect = [
+            {
+                'aggregations': {
+                    'unique_documents': {
+                        'buckets': (  # This will trigger the isinstance check
+                            {'key': '/path/doc1.pdf', 'doc_count': 3},
+                        )
+                    }
+                }
+            },
+            {
+                'hits': {
+                    'hits': [
+                        {
+                            '_source': {
+                                'filename': 'doc1.pdf',
+                                'content': 'test content',
+                                'embedding': [0.1, 0.2, 0.3],
+                                'file_size': 1000
+                            }
+                        }
+                    ]
+                }
+            }
+        ]
+
+        result = get_documents_from_es('test_index', mock_es_core)
+        assert isinstance(result, dict)
+
+    def test_get_documents_from_es_no_chunks(self):
+        """Test ES retrieval when document has no chunks"""
+        mock_es_core = MagicMock()
+        mock_es_core.client.search.side_effect = [
+            {
+                'aggregations': {
+                    'unique_documents': {
+                        'buckets': [
+                            {'key': '/path/doc1.pdf', 'doc_count': 0}
+                        ]
+                    }
+                }
+            },
+            {
+                'hits': {
+                    'hits': []  # No chunks
+                }
+            }
+        ]
+
+        result = get_documents_from_es('test_index', mock_es_core)
+        assert result == {}  # Should return empty dict when no chunks
+
+    def test_calculate_document_embedding_exception(self):
+        """Test calculate_document_embedding with exception"""
+        chunks = [
+            {'content': 'test content', 'embedding': [0.1, 0.2, 0.3]}
+        ]
+
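+        # NOTE: patch('numpy.array') swaps the attribute on the shared numpy
+        # module object, so np.array(...) inside the module under test is
+        # affected too -- assuming it uses `import numpy as np` rather than
+        # binding `from numpy import array` at import time.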
+        # Mock numpy operations to raise exception
+        with patch('numpy.array') as mock_array:
+            mock_array.side_effect = Exception("Numpy error")
+
+            result = calculate_document_embedding(chunks)
+            assert result is None
+
+    def test_auto_determine_k_small_dataset(self):
+        """Test auto_determine_k with very small dataset"""
+        # Create embeddings with only 2 samples (fewer than min_k=3)
+        embeddings = np.array([[0.1, 0.2], [0.3, 0.4]])
+
+        result = auto_determine_k(embeddings, min_k=3, max_k=5)
+        assert result == 2  # Falls back to 2 clusters when n_samples < min_k
+
+    def test_auto_determine_k_exception(self):
+        """Test auto_determine_k with exception during calculation"""
+        embeddings = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
+
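+        # NOTE: this patch only takes effect if the code under test resolves
+        # silhouette_score on sklearn.metrics at call time; a name bound via
+        # `from sklearn.metrics import silhouette_score` at import time would
+        # not be redirected (an assumption about the implementation).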
+        # Mock silhouette_score to raise exception
+        with patch('sklearn.metrics.silhouette_score') as mock_silhouette:
+            mock_silhouette.side_effect = Exception("Silhouette error")
+
+            result = auto_determine_k(embeddings, min_k=2, max_k=3)
+            # Should use heuristic fallback
+            assert isinstance(result, int)
+            assert result >= 2
+
+    def test_kmeans_cluster_documents_empty(self):
+        """Test kmeans_cluster_documents with empty embeddings"""
+        result = kmeans_cluster_documents({})
+        assert result == {}
+
+    def test_kmeans_cluster_documents_exception(self):
+        """Test kmeans_cluster_documents with exception"""
+        doc_embeddings = {
+            'doc1': np.array([0.1, 0.2, 0.3]),
+            'doc2': np.array([0.4, 0.5, 0.6])
+        }
+
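+        # Patch the name where it is used (the module under test), not where
+        # it is defined, so the lookup inside kmeans_cluster_documents
+        # resolves to the mock.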
+        # Mock auto_determine_k to raise exception
+        with patch('backend.utils.document_vector_utils.auto_determine_k') as mock_auto_k:
+            mock_auto_k.side_effect = Exception("Auto K error")
+
+            with pytest.raises(Exception, match="Failed to cluster documents"):
+                kmeans_cluster_documents(doc_embeddings)
+
+    def test_process_documents_for_clustering_exception(self):
+        """Test process_documents_for_clustering with exception"""
+        mock_es_core = MagicMock()
+        mock_es_core.client.search.side_effect = Exception("ES error")
+
+        with pytest.raises(Exception, match="Failed to process documents"):
+            process_documents_for_clustering('test_index', mock_es_core)
+
+    def test_process_documents_for_clustering_no_embeddings(self):
+        """Test process_documents_for_clustering when some documents fail embedding calculation"""
+        mock_es_core = MagicMock()
+        mock_es_core.client.search.return_value = {
+            'aggregations': {
+                'unique_documents': {
+                    'buckets': [
+                        {'key': '/path/doc1.pdf', 'doc_count': 1}
+                    ]
+                }
+            },
+            'hits': {
+                'hits': [
+                    {
+                        '_source': {
+                            'filename': 'doc1.pdf',
+                            'content': 'test content',
+                            'embedding': [0.1, 0.2, 0.3],
+                            'file_size': 1000
+                        }
+                    }
+                ]
+            }
+        }
+
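+        # Unlike a side_effect list, return_value serves this same combined
+        # response (aggregations plus hits) for every search call.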
+        # Mock calculate_document_embedding to return None
+        with patch('backend.utils.document_vector_utils.calculate_document_embedding') as mock_calc:
+            mock_calc.return_value = None
+
+            docs, embeddings = process_documents_for_clustering('test_index', mock_es_core)
+            assert isinstance(docs, dict)
+            assert isinstance(embeddings, dict)
+            assert len(embeddings) == 0  # No successful embeddings
+
+    def test_extract_cluster_content_missing_doc(self):
+        """Test extract_cluster_content with missing document"""
+        document_samples = {
+            'doc1': {
+                'chunks': [{'content': 'test content'}]
+            }
+        }
+        cluster_doc_ids = ['doc1', 'missing_doc']
+
+        result = extract_cluster_content(document_samples, cluster_doc_ids)
+        assert isinstance(result, str)
+        assert 'test content' in result
+
+    def test_extract_cluster_content_no_chunks(self):
+        """Test extract_cluster_content with document having no chunks"""
+        document_samples = {
+            'doc1': {
+                'chunks': []
+            }
+        }
+        cluster_doc_ids = ['doc1']
+
+        result = extract_cluster_content(document_samples, cluster_doc_ids)
+        assert isinstance(result, str)
+
+    def test_extract_representative_chunks_smart_import_error(self):
+        """Test extract_representative_chunks_smart with ImportError"""
+        chunks = [
+            {'content': 'chunk 1'},
+            {'content': 'chunk 2'},
+            {'content': 'chunk 3'}
+        ]
+
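+        # NOTE: patching builtins.__import__ fails *every* import executed
+        # inside the with-block, so the assertions below assume the fallback
+        # path in extract_representative_chunks_smart needs no further imports.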
+        # Mock the import to raise ImportError
+        with patch('builtins.__import__', side_effect=ImportError("Module not found")):
+            result = extract_representative_chunks_smart(chunks, max_chunks=2)
+            assert len(result) <= 2
+            assert len(result) > 0
+
+    def test_extract_representative_chunks_smart_short_content(self):
+        """Test extract_representative_chunks_smart with short content"""
+        chunks = [
+            {'content': 'short'},
+            {'content': 'also short'},
+            {'content': 'very short content'}
+        ]
+
+        result = extract_representative_chunks_smart(chunks, max_chunks=2)
+        assert len(result) <= 2
+        assert len(result) > 0
+
+    def test_analyze_cluster_coherence_empty(self):
+        """Test analyze_cluster_coherence with empty cluster_doc_ids"""
+        document_samples = {
+            'doc1': {
+                'chunks': [{'content': 'test content'}]
+            }
+        }
+        cluster_doc_ids = []
+
+        result = analyze_cluster_coherence(cluster_doc_ids, document_samples)
+        assert result == {}
+
+    def test_analyze_cluster_coherence_missing_doc(self):
+        """Test analyze_cluster_coherence with missing document"""
+        document_samples = {
+            'doc1': {
+                'chunks': [{'content': 'test content'}]
+            }
+        }
+        cluster_doc_ids = ['doc1', 'missing_doc']
+
+        result = analyze_cluster_coherence(cluster_doc_ids, document_samples)
+        assert isinstance(result, dict)
+
+    def test_analyze_cluster_coherence_no_chunks(self):
+        """Test analyze_cluster_coherence with document having no chunks"""
+        document_samples = {
+            'doc1': {
+                'chunks': []
+            }
+        }
+        cluster_doc_ids = ['doc1']
+
+        result = analyze_cluster_coherence(cluster_doc_ids, document_samples)
+        assert isinstance(result, dict)
+
+    def test_summarize_clusters_map_reduce_missing_doc(self):
+        """Test summarize_clusters_map_reduce with missing document"""
+        document_samples = {
+            'doc1': {
+                'chunks': [{'content': 'test content'}],
+                'filename': 'test.pdf'
+            }
+        }
+        clusters = {0: ['doc1', 'missing_doc']}
+
+        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
+            mock_sum_doc.return_value = "Doc summary"
+
+            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
+                mock_sum_cluster.return_value = "Cluster summary"
+
+                result = summarize_clusters_map_reduce(document_samples, clusters)
+                assert isinstance(result, dict)
+                assert 0 in result
+
+    def test_summarize_clusters_map_reduce_few_chunks(self):
+        """Test summarize_clusters_map_reduce with document having few chunks"""
+        document_samples = {
+            'doc1': {
+                'chunks': [
+                    {'content': 'chunk 1'},
+                    {'content': 'chunk 2'}
+                ],
+                'filename': 'test.pdf'
+            }
+        }
+        clusters = {0: ['doc1']}
+
+        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
+            mock_sum_doc.return_value = "Doc summary"
+
+            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
+                mock_sum_cluster.return_value = "Cluster summary"
+
+                result = summarize_clusters_map_reduce(document_samples, clusters)
+                assert isinstance(result, dict)
+                assert 0 in result
+
+    def test_summarize_clusters_map_reduce_long_content(self):
+        """Test summarize_clusters_map_reduce with long content"""
+        long_content = 'x' * 1500  # Longer than 1000 chars
+        document_samples = {
+            'doc1': {
+                'chunks': [
+                    {'content': long_content}
+                ],
+                'filename': 'test.pdf'
+            }
+        }
+        clusters = {0: ['doc1']}
+
+        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
+            mock_sum_doc.return_value = "Doc summary"
+
+            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
+                mock_sum_cluster.return_value = "Cluster summary"
+
+                result = summarize_clusters_map_reduce(document_samples, clusters)
+                assert isinstance(result, dict)
+                assert 0 in result
+
+    def test_summarize_clusters_map_reduce_no_valid_docs(self):
+        """Test summarize_clusters_map_reduce with no valid document summaries"""
+        document_samples = {
+            'doc1': {
+                'chunks': [{'content': 'test content'}],
+                'filename': 'test.pdf'
+            }
+        }
+        clusters = {0: ['doc1']}
+
+        with patch('backend.utils.document_vector_utils.summarize_document') as mock_sum_doc:
+            mock_sum_doc.return_value = ""  # Empty summary
+
+            with patch('backend.utils.document_vector_utils.summarize_cluster') as mock_sum_cluster:
+                mock_sum_cluster.return_value = "Cluster summary"
+
+                result = summarize_clusters_map_reduce(document_samples, clusters)
+                assert isinstance(result, dict)
+                assert 0 in result
+
+    def test_summarize_cluster_legacy_exception(self):
+        """Test summarize_cluster_legacy with exception"""
+        cluster_content = "Test cluster content"
+
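+        # NOTE: patching builtins.open makes every open() in the block raise,
+        # which assumes summarize_cluster_legacy touches the filesystem (e.g.
+        # a prompt template) and folds the failure into its returned message.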
+        # Mock file operations to raise exception
+        with patch('builtins.open', side_effect=Exception("File error")):
+            result = summarize_cluster_legacy(cluster_content)
+            assert "Failed to generate summary" in result
+