Commit 2420983
Merge pull request #1225 from NASA-IMPACT/1190-add-tests-for-job-generation-pipeline
Tests for Config & Job Creation + XML Processing
2 parents 111cdfa + b314e68 commit 2420983

9 files changed (+893, -716 lines)

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -43,6 +43,12 @@ For each PR made, an entry should be added to this changelog. It should contain
 - Made `match_pattern_type` searchable
 - Corrected the column references and made code consistent on all the other tables, i.e., `exclude_patterns_table`, `include_patterns_table`, `division_patterns_table` and `document_type_patterns_table`
 
+- 1190-add-tests-for-job-generation-pipeline
+  - Description: Tests have been added to enhance coverage for the config and job creation pipeline, alongside comprehensive tests for XML processing.
+  - Changes:
+    - Added config_generation/tests/test_config_generation_pipeline.py, which tests the config and job generation pipeline, ensuring all components interact correctly
+    - Updated config_generation/tests/test_db_to_xml.py to include comprehensive tests for XML processing
+
 - 1001-tests-for-critical-functionalities
   - Description: Critical functionalities have been identified and listed, and critical areas lacking tests listed
   - Changes:
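
The new test modules can be run individually. The first command below appears verbatim in the header comment added to test_db_to_xml.py in this commit; the second assumes, without verification, that the same docker-compose entry point works for the new pipeline test:

docker-compose -f local.yml run --rm django pytest config_generation/tests/test_db_to_xml.py
docker-compose -f local.yml run --rm django pytest config_generation/tests/test_config_generation_pipeline.py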

config_generation/db_to_xml.py

Lines changed: 43 additions & 30 deletions

@@ -148,35 +148,51 @@ def convert_template_to_scraper(self, collection) -> None:
         scraper_config = self.update_config_xml()
         return scraper_config
 
-    def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
+    def convert_template_to_job(self, collection, job_source) -> None:
         """
-        assuming this class has been instantiated with the scraper_template.xml
+        assuming this class has been instantiated with the job_template.xml
+        """
+        self.update_or_add_element_value("Collection", f"/{job_source}/{collection.config_folder}/")
+        job_config = self.update_config_xml()
+        return job_config
+
+    def convert_template_to_indexer(self, scraper_editor) -> None:
+        """
+        assuming this class has been instantiated with the final_config_template.xml
         """
 
         transfer_fields = [
-            "KeepHashFragmentInUrl",
-            "CorrectDomainCookies",
-            "IgnoreSessionCookies",
-            "DownloadImages",
-            "DownloadMedia",
-            "DownloadCss",
-            "DownloadFtp",
-            "DownloadFile",
-            "IndexJs",
-            "FollowJs",
-            "CrawlFlash",
-            "NormalizeSecureSchemesWhenTestingVisited",
-            "RetryCount",
-            "RetryPause",
-            "AddBaseHref",
-            "AddMetaContentType",
-            "NormalizeUrls",
+            "Throttle",
         ]
 
         double_transfer_fields = [
-            ("UrlAccess", "AllowXPathCookies"),
             ("UrlAccess", "UseBrowserForWebRequests"),
-            ("UrlAccess", "UseHttpClientForWebRequests"),
+            ("UrlAccess", "BrowserForWebRequestsReadinessThreshold"),
+            ("UrlAccess", "BrowserForWebRequestsInitialDelay"),
+            ("UrlAccess", "BrowserForWebRequestsMaxTotalDelay"),
+            ("UrlAccess", "BrowserForWebRequestsMaxResourcesDelay"),
+            ("UrlAccess", "BrowserForWebRequestsLogLevel"),
+            ("UrlAccess", "BrowserForWebRequestsViewportWidth"),
+            ("UrlAccess", "BrowserForWebRequestsViewportHeight"),
+            ("UrlAccess", "BrowserForWebRequestsAdditionalJavascript"),
+            ("UrlAccess", "PostLoginUrl"),
+            ("UrlAccess", "PostLoginData"),
+            ("UrlAccess", "GetBeforePostLogin"),
+            ("UrlAccess", "PostLoginAutoRedirect"),
+            ("UrlAccess", "ReLoginCount"),
+            ("UrlAccess", "ReLoginDelay"),
+            ("UrlAccess", "DetectHtmlLoginPattern"),
+            ("IndexerClient", "RetryTimeout"),
+            ("IndexerClient", "RetrySleep"),
+        ]
+
+        triple_transfer_fields = [
+            ("UrlAccess", "BrowserLogin", "Activate"),
+            ("UrlAccess", "BrowserLogin", "RemoteDebuggingPort"),
+            ("UrlAccess", "BrowserLogin", "BrowserLogLevel"),
+            ("UrlAccess", "BrowserLogin", "ShowDevTools"),
+            ("UrlAccess", "BrowserLogin", "SuccessCondition"),
+            ("UrlAccess", "BrowserLogin", "CookieFilter"),
         ]
 
         for field in transfer_fields:
@@ -187,18 +203,15 @@ def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
                 f"{parent}/{child}", scraper_editor.get_tag_value(f"{parent}/{child}", strict=True)
             )
 
+        for grandparent, parent, child in triple_transfer_fields:
+            self.update_or_add_element_value(
+                f"{grandparent}/{parent}/{child}",
+                scraper_editor.get_tag_value(f"{grandparent}/{parent}/{child}", strict=True),
+            )
+
         scraper_config = self.update_config_xml()
         return scraper_config
 
-    def convert_template_to_indexer(self, collection) -> None:
-        """
-        assuming this class has been instantiated with the indexer_template.xml
-        """
-        self.update_or_add_element_value("Collection", f"/SDE/{collection.config_folder}/")
-        indexer_config = self.update_config_xml()
-
-        return indexer_config
-
     def _mapping_exists(self, new_mapping: ET.Element):
         """
         Check if the mapping with given parameters already exists in the XML tree
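
For orientation, a minimal usage sketch of the new convert_template_to_job method follows. The inline XML string stands in for job_template.xml, the SimpleNamespace object is a hypothetical stand-in for the Django Collection model (only config_folder is used here), and the job_source value is illustrative:

from types import SimpleNamespace

from config_generation.db_to_xml import XmlEditor

# Hypothetical stand-ins for job_template.xml and a Collection instance
job_template = "<root><Collection>placeholder</Collection></root>"
collection = SimpleNamespace(config_folder="example_collection")

editor = XmlEditor(job_template)
job_config = editor.convert_template_to_job(collection, job_source="scrapers")
# The <Collection> element now reads /scrapers/example_collection/, and
# job_config holds the serialized XML returned by update_config_xml()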
config_generation/tests/test_config_generation_pipeline.py

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+from unittest.mock import MagicMock, call, patch
+
+from django.test import TestCase
+
+from sde_collections.models.collection import Collection
+from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
+
+"""
+Workflow status change → Opens template → Applies XML transformation → Writes to GitHub.
+
+- When the `workflow_status` changes, it triggers the relevant config creation method.
+- The method reads a template and processes it using `XmlEditor`.
+- `XmlEditor` modifies the template by injecting collection-specific values and transformations.
+- The generated XML is passed to `_write_to_github()`, which commits it directly to GitHub.
+
+Note: This test verifies that the correct methods are triggered and XML content is passed to GitHub.
+The actual XML structure and correctness are tested separately in `test_db_to_xml.py`.
+"""
+
+
+class TestConfigCreation(TestCase):
+    def setUp(self):
+        self.collection = Collection.objects.create(
+            name="Test Collection", division="1", workflow_status=WorkflowStatusChoices.RESEARCH_IN_PROGRESS
+        )
+
+    @patch("sde_collections.utils.github_helper.GitHubHandler")  # Mock GitHubHandler
+    @patch("sde_collections.models.collection.Collection._write_to_github")
+    @patch("sde_collections.models.collection.XmlEditor")
+    def test_ready_for_engineering_triggers_config_and_job_creation(
+        self, MockXmlEditor, mock_write_to_github, MockGitHubHandler
+    ):
+        """
+        When the collection's workflow status is updated to READY_FOR_ENGINEERING,
+        it should trigger the creation of scraper configuration and job files.
+        """
+        # Mock GitHubHandler to avoid actual API calls
+        mock_github_instance = MockGitHubHandler.return_value
+        mock_github_instance.create_file.return_value = None
+        mock_github_instance.create_or_update_file.return_value = None
+
+        # Set up the XmlEditor mock for both config and job
+        mock_editor_instance = MockXmlEditor.return_value
+        mock_editor_instance.convert_template_to_scraper.return_value = "<scraper_config>config_data</scraper_config>"
+        mock_editor_instance.convert_template_to_job.return_value = "<scraper_job>job_data</scraper_job>"
+
+        # Simulate the status change to READY_FOR_ENGINEERING
+        self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_ENGINEERING
+        self.collection.save()
+
+        # Verify that the XML for both config and job are generated and written to GitHub
+        expected_calls = [
+            call(self.collection._scraper_config_path, "<scraper_config>config_data</scraper_config>", False),
+            call(self.collection._scraper_job_path, "<scraper_job>job_data</scraper_job>", False),
+        ]
+        mock_write_to_github.assert_has_calls(expected_calls, any_order=True)
+
+    @patch("sde_collections.models.collection.GitHubHandler")  # Mock GitHubHandler in the correct module path
+    @patch("sde_collections.models.collection.Collection._write_to_github")
+    @patch("sde_collections.models.collection.XmlEditor")
+    def test_ready_for_curation_triggers_indexer_config_and_job_creation(
+        self, MockXmlEditor, mock_write_to_github, MockGitHubHandler
+    ):
+        """
+        When the collection's workflow status is updated to READY_FOR_CURATION,
+        it should trigger indexer config and job creation methods.
+        """
+        # Mock GitHubHandler to avoid actual API calls
+        mock_github_instance = MockGitHubHandler.return_value
+        mock_github_instance.check_file_exists.return_value = True  # Assume scraper exists
+        mock_github_instance._get_file_contents.return_value = MagicMock()
+        mock_github_instance._get_file_contents.return_value.decoded_content = (
+            b"<scraper_config>Mock Data</scraper_config>"
+        )
+
+        # Set up the XmlEditor mock for both config and job
+        mock_editor_instance = MockXmlEditor.return_value
+        mock_editor_instance.convert_template_to_indexer.return_value = "<indexer_config>config_data</indexer_config>"
+        mock_editor_instance.convert_template_to_job.return_value = "<indexer_job>job_data</indexer_job>"
+
+        # Simulate the status change to READY_FOR_CURATION
+        self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
+        self.collection.save()
+
+        # Verify that the XML for both indexer config and job are generated and written to GitHub
+        expected_calls = [
+            call(self.collection._indexer_config_path, "<indexer_config>config_data</indexer_config>", True),
+            call(self.collection._indexer_job_path, "<indexer_job>job_data</indexer_job>", False),
+        ]
+        mock_write_to_github.assert_has_calls(expected_calls, any_order=True)
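
For readers tracing the flow, here is a rough reconstruction of the dispatch these tests exercise, pieced together from the mocked names and assertions above. It is a sketch, not the actual model code: the real logic lives in sde_collections/models/collection.py, read_template() is a hypothetical loader, the template file names follow the docstrings in db_to_xml.py where shown (job_template.xml, final_config_template.xml) and are otherwise guesses, and the job_source strings and boolean third argument to _write_to_github() simply mirror what the tests assert:

from config_generation.db_to_xml import XmlEditor
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices


def read_template(path):
    # Hypothetical helper; how templates are actually loaded is not shown in this diff
    with open(path) as f:
        return f.read()


def handle_status_change(collection, new_status):
    if new_status == WorkflowStatusChoices.READY_FOR_ENGINEERING:
        config = XmlEditor(read_template("scraper_template.xml")).convert_template_to_scraper(collection)
        job = XmlEditor(read_template("job_template.xml")).convert_template_to_job(collection, "scrapers")
        collection._write_to_github(collection._scraper_config_path, config, False)
        collection._write_to_github(collection._scraper_job_path, job, False)
    elif new_status == WorkflowStatusChoices.READY_FOR_CURATION:
        # The test seeds the existing scraper config from GitHub; inlined here for brevity
        scraper_editor = XmlEditor("<scraper_config>Mock Data</scraper_config>")
        config = XmlEditor(read_template("final_config_template.xml")).convert_template_to_indexer(scraper_editor)
        job = XmlEditor(read_template("job_template.xml")).convert_template_to_job(collection, "indexers")
        collection._write_to_github(collection._indexer_config_path, config, True)
        collection._write_to_github(collection._indexer_job_path, job, False)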
config_generation/tests/test_db_to_xml.py

Lines changed: 108 additions & 32 deletions

@@ -1,4 +1,7 @@
-import xml.etree.ElementTree as ET
+# docker-compose -f local.yml run --rm django pytest config_generation/tests/test_db_to_xml.py
+from xml.etree.ElementTree import ElementTree, ParseError, fromstring
+
+import pytest
 
 from ..db_to_xml import XmlEditor
 
@@ -28,39 +31,112 @@ def elements_equal(e1, e2):
             return False
         return all(elements_equal(c1, c2) for c1, c2 in zip(e1, e2))
 
-    tree1 = ET.fromstring(xml1)
-    tree2 = ET.fromstring(xml2)
-    return elements_equal(tree1, tree2)
+    tree1 = ElementTree(fromstring(xml1))
+    tree2 = ElementTree(fromstring(xml2))
 
+    return elements_equal(tree1.getroot(), tree2.getroot())
 
-def test_update_or_add_element_value():
-    xml_string = """<root>
-    <child>
-        <grandchild>old_value</grandchild>
-    </child>
-    </root>"""
 
+# Tests for valid and invalid XML initializations
+def test_valid_xml_initialization():
+    xml_string = "<root><child>Test</child></root>"
     editor = XmlEditor(xml_string)
+    assert editor.get_tag_value("child") == ["Test"]
 
-    # To update an existing element's value
-    updated_xml = editor.update_or_add_element_value("child/grandchild", "new_value")
-    expected_output = """<root>
-    <child>
-        <grandchild>new_value</grandchild>
-    </child>
-    </root>
-    """
-    assert xmls_equal(updated_xml, expected_output)
-
-    # To create a new element and set its value
-    new_xml = editor.update_or_add_element_value("newchild", "some_value")
-    expected_output = """<root>
-    <child>
-        <grandchild>new_value</grandchild>
-    </child>
-    <newchild>
-        some_value
-    </newchild>
-    </root>
-    """
-    assert xmls_equal(new_xml, expected_output)
+
+def test_invalid_xml_initialization():
+    with pytest.raises(ParseError):
+        XmlEditor("<root><child></root>")
+
+
+# Test retrieval of single and multiple tag values
+def test_get_single_tag_value():
+    xml_string = "<root><child>Test</child></root>"
+    editor = XmlEditor(xml_string)
+    assert editor.get_tag_value("child", strict=True) == "Test"
+
+
+def test_get_nonexistent_tag_value():
+    xml_string = "<root><child>Test</child></root>"
+    editor = XmlEditor(xml_string)
+    assert editor.get_tag_value("nonexistent", strict=False) == []
+
+
+def test_get_tag_value_strict_multiple_elements():
+    xml_string = "<root><child>One</child><child>Two</child></root>"
+    editor = XmlEditor(xml_string)
+    with pytest.raises(ValueError):
+        editor.get_tag_value("child", strict=True)
+
+
+# Test updating and adding XML elements
+def test_update_existing_element():
+    xml_string = "<root><child>Old</child></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("child", "New")
+    updated_xml = editor.update_config_xml()
+    assert "New" in updated_xml and "Old" not in updated_xml
+
+
+def test_add_new_element():
+    xml_string = "<root></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("newchild", "Value")
+    updated_xml = editor.update_config_xml()
+    assert "Value" in updated_xml and "<newchild>Value</newchild>" in updated_xml
+
+
+def test_add_third_level_hierarchy():
+    xml_string = "<root></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("parent/child/grandchild", "DeeplyNested")
+    updated_xml = editor.update_config_xml()
+    root = fromstring(updated_xml)
+    grandchild = root.find(".//grandchild")
+    assert grandchild is not None, "Grandchild element not found"
+    assert grandchild.text == "DeeplyNested", "Grandchild does not contain the correct text"
+
+    # Check complete path
+    parent = root.find(".//parent/child/grandchild")
+    assert parent is not None, "Complete path to grandchild not found"
+    assert parent.text == "DeeplyNested", "Complete path to grandchild does not contain correct text"
+
+
+# Test transformations and generic mapping
+def test_convert_indexer_to_scraper_transformation():
+    xml_string = """<root><Plugin>Indexer</Plugin></root>"""
+    editor = XmlEditor(xml_string)
+    editor.convert_indexer_to_scraper()
+    updated_xml = editor.update_config_xml()
+    assert "<Plugin>SMD_Plugins/Sinequa.Plugin.ListCandidateUrls</Plugin>" in updated_xml
+    assert "<Plugin>Indexer</Plugin>" not in updated_xml
+
+
+def test_generic_mapping_addition():
+    xml_string = "<root></root>"
+    editor = XmlEditor(xml_string)
+    editor._generic_mapping(name="id", value="doc.url1", selection="url1")
+    updated_xml = editor.update_config_xml()
+    assert "<Mapping>" in updated_xml
+    assert "<Name>id</Name>" in updated_xml
+    assert "<Value>doc.url1</Value>" in updated_xml
+
+
+# Test XML serialization with headers
+def test_xml_serialization_with_header():
+    xml_string = "<root><child>Value</child></root>"
+    editor = XmlEditor(xml_string)
+    xml_output = editor.update_config_xml()
+    assert '<?xml version="1.0" encoding="utf-8"?>' in xml_output
+    assert "<root>" in xml_output and "<child>Value</child>" in xml_output
+
+
+# Test handling multiple changes accumulation
+def test_multiple_changes_accumulation():
+    xml_string = "<root><child>Initial</child></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("child", "Modified")
+    editor.update_or_add_element_value("newchild", "Added")
+    updated_xml = editor.update_config_xml()
+    assert "Modified" in updated_xml and "Added" in updated_xml
+    assert "Initial" not in updated_xml
