Update tests: add layout analysis and mark-up cleaner cases; run CI coverage with .coveragerc

BenConstable9 · BenConstable9 · commit dfa9f461a2aa · 2025-02-19T11:37:32.000Z
diff --git a/.github/workflows/ci-checks.yaml b/.github/workflows/ci-checks.yaml
@@ -60,5 +60,5 @@ jobs:
         working-directory: image_processing
 
       - name: Run PyTest
-        run: uv run pytest --cov=image_processing
+        run: uv run pytest --cov=. --cov-config=.coveragerc
         working-directory: image_processing
diff --git a/image_processing/tests/image_processing/test_layout_analysis.py b/image_processing/tests/image_processing/test_layout_analysis.py
@@ -37,6 +37,29 @@ def __init__(self, content):
         self.content = content
 
 
+class DummyPoller:
+    def __init__(self, result, operation_id):
+        self._result = result
+        self.details = {"operation_id": operation_id}
+
+    async def result(self):
+        return self._result
+
+
+class DummyDocIntelligenceClient:
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        pass
+
+    async def begin_analyze_document(self, **kwargs):
+        # Create a dummy page spanning the first 5 characters.
+        dummy_page = DummyPage(0, 5, 1)
+        dummy_result = DummyResult("HelloWorld", pages=[dummy_page], figures=[])
+        return DummyPoller(dummy_result, "dummy_op")
+
+
 class DummyFigure:
     def __init__(self, id, offset, length, page_number, caption_content):
         self.id = id  # note: process_figures_from_extracted_content checks "if figure.id is None"
@@ -361,3 +384,54 @@ async def test_process_layout_analysis_missing_source():
     assert result["data"] is None
     assert result["errors"] is not None
     assert "Pass a valid source" in result["errors"][0]["message"]
+
+
+@pytest.mark.asyncio
+async def test_analyse_document_success(monkeypatch, tmp_path):
+    # Create a temporary file with dummy content.
+    tmp_file = tmp_path / "dummy.txt"
+    tmp_file.write_bytes(b"dummy content")
+
+    la = LayoutAnalysis(
+        record_id=999,
+        source="https://dummyaccount.blob.core.windows.net/container/path/to/dummy.txt",
+    )
+
+    # Use an async function to return our dummy Document Intelligence client.
+    async def dummy_get_doc_intelligence_client():
+        return DummyDocIntelligenceClient()
+
+    monkeypatch.setattr(
+        la, "get_document_intelligence_client", dummy_get_doc_intelligence_client
+    )
+
+    await la.analyse_document(str(tmp_file))
+
+    assert la.result is not None
+    assert la.operation_id == "dummy_op"
+    # Check that the dummy result contains the expected content.
+    assert la.result.content == "HelloWorld"
+
+
+def test_create_page_wise_content():
+    # Test create_page_wise_content using a dummy result with one page.
+    la = LayoutAnalysis(record_id=100, source="dummy")
+
+    # Create a dummy result with content "HelloWorld"
+    # and a page with a span from index 0 with length 5.
+    class DummyResultContent:
+        pass
+
+    dummy_result = DummyResultContent()
+    dummy_result.content = "HelloWorld"
+    dummy_result.pages = [DummyPage(0, 5, 1)]
+    la.result = dummy_result
+
+    layouts = la.create_page_wise_content()
+    assert isinstance(layouts, list)
+    assert len(layouts) == 1
+    layout = layouts[0]
+    # The page content should be the substring "Hello"
+    assert layout.content == "Hello"
+    assert layout.page_number == 1
+    assert layout.page_offsets == 0
diff --git a/image_processing/tests/image_processing/test_mark_up_cleaner.py b/image_processing/tests/image_processing/test_mark_up_cleaner.py
@@ -127,3 +127,61 @@ async def test_clean(cleaner, sample_text, figures):
     assert "chunk_figures" in result["data"]
     assert len(result["data"]["chunk_figures"]) == 1
     assert result["data"]["chunk_figures"][0]["FigureId"] == "12345"
+
+
+def test_get_sections_empty_text(cleaner):
+    # When no text is provided, no sections should be found.
+    sections = cleaner.get_sections("")
+    assert sections == []
+
+
+def test_get_figure_ids_no_figures(cleaner):
+    # When the text contains no figure tags, an empty list should be returned.
+    text = "This text does not include any figures."
+    assert cleaner.get_figure_ids(text) == []
+
+
+def test_remove_markdown_tags_unknown_tag(cleaner):
+    # When a tag in tag_patterns does not match anything, text remains unchanged.
+    text = "This is a basic text without markdown."
+    tag_patterns = {"nonexistent": r"(pattern)"}
+    result = cleaner.remove_markdown_tags(text, tag_patterns)
+    assert result == text
+
+
+def test_clean_text_and_extract_metadata_empty_text(cleaner, figures):
+    # Passing empty text should take the error-handling path and return an empty string.
+    result = cleaner.clean_text_and_extract_metadata("", figures)
+    assert result == ""
+
+
+@pytest.mark.asyncio
+async def test_clean_missing_chunk(cleaner):
+    # When record['data'] is missing the "chunk" key, an exception is raised and the error branch returns a proper error dict.
+    record = {
+        "recordId": "3",
+        "data": {"figures": []},
+    }
+    result = await cleaner.clean(record)
+    assert result["recordId"] == "3"
+    assert result["data"] is None
+    assert result["errors"] is not None
+    assert "Failed to cleanup data" in result["errors"][0]["message"]
+
+
+@pytest.mark.asyncio
+async def test_clean_with_invalid_figures_structure(cleaner):
+    # When figure dicts don't have the expected structure for FigureHolder,
+    # the construction in clean() will raise an exception and trigger the error branch.
+    record = {
+        "recordId": "4",
+        "data": {
+            "chunk": "Some text with # Header",
+            # Figures are missing required keys.
+            "figures": [{"invalid_key": "no_fig_id"}],
+        },
+    }
+    result = await cleaner.clean(record)
+    assert result["recordId"] == "4"
+    assert result["data"] is None
+    assert result["errors"] is not None