Commit 2420983
Merge pull request #1225 from NASA-IMPACT/1190-add-tests-for-job-generation-pipeline
Tests for Config & Job Creation + XML Processing
2 parents 111cdfa + b314e68 commit 2420983

9 files changed (+893, -716 lines)

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -43,6 +43,12 @@ For each PR made, an entry should be added to this changelog. It should contain
 - Made `match_pattern_type` searchable
 - Corrected the column references and made code consistent on all the other tables, i.e., `exclude_patterns_table`, `include_patterns_table`, `division_patterns_table` and `document_type_patterns_table`
 
+- 1190-add-tests-for-job-generation-pipeline
+  - Description: Tests have been added to enhance coverage for the config and job creation pipeline, alongside comprehensive tests for XML processing.
+  - Changes:
+    - Added config_generation/tests/test_config_generation_pipeline.py, which tests the config and job generation pipeline, ensuring all components interact correctly
+    - Updated config_generation/tests/test_db_to_xml.py to include comprehensive tests for XML processing
+
 - 1001-tests-for-critical-functionalities
   - Description: Critical functionalities have been identified and listed, and critical areas lacking tests listed
   - Changes:
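
The new test modules can be run individually. The first command below appears verbatim in the header comment added to test_db_to_xml.py in this commit; the second assumes, without verification, that the same docker-compose entry point works for the new pipeline test:

docker-compose -f local.yml run --rm django pytest config_generation/tests/test_db_to_xml.py
docker-compose -f local.yml run --rm django pytest config_generation/tests/test_config_generation_pipeline.py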

config_generation/db_to_xml.py

Lines changed: 43 additions & 30 deletions

@@ -148,35 +148,51 @@ def convert_template_to_scraper(self, collection) -> None:
         scraper_config = self.update_config_xml()
         return scraper_config
 
-    def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
+    def convert_template_to_job(self, collection, job_source) -> None:
         """
-        assuming this class has been instantiated with the scraper_template.xml
+        assuming this class has been instantiated with the job_template.xml
+        """
+        self.update_or_add_element_value("Collection", f"/{job_source}/{collection.config_folder}/")
+        job_config = self.update_config_xml()
+        return job_config
+
+    def convert_template_to_indexer(self, scraper_editor) -> None:
+        """
+        assuming this class has been instantiated with the final_config_template.xml
         """
 
         transfer_fields = [
-            "KeepHashFragmentInUrl",
-            "CorrectDomainCookies",
-            "IgnoreSessionCookies",
-            "DownloadImages",
-            "DownloadMedia",
-            "DownloadCss",
-            "DownloadFtp",
-            "DownloadFile",
-            "IndexJs",
-            "FollowJs",
-            "CrawlFlash",
-            "NormalizeSecureSchemesWhenTestingVisited",
-            "RetryCount",
-            "RetryPause",
-            "AddBaseHref",
-            "AddMetaContentType",
-            "NormalizeUrls",
+            "Throttle",
         ]
 
         double_transfer_fields = [
-            ("UrlAccess", "AllowXPathCookies"),
             ("UrlAccess", "UseBrowserForWebRequests"),
-            ("UrlAccess", "UseHttpClientForWebRequests"),
+            ("UrlAccess", "BrowserForWebRequestsReadinessThreshold"),
+            ("UrlAccess", "BrowserForWebRequestsInitialDelay"),
+            ("UrlAccess", "BrowserForWebRequestsMaxTotalDelay"),
+            ("UrlAccess", "BrowserForWebRequestsMaxResourcesDelay"),
+            ("UrlAccess", "BrowserForWebRequestsLogLevel"),
+            ("UrlAccess", "BrowserForWebRequestsViewportWidth"),
+            ("UrlAccess", "BrowserForWebRequestsViewportHeight"),
+            ("UrlAccess", "BrowserForWebRequestsAdditionalJavascript"),
+            ("UrlAccess", "PostLoginUrl"),
+            ("UrlAccess", "PostLoginData"),
+            ("UrlAccess", "GetBeforePostLogin"),
+            ("UrlAccess", "PostLoginAutoRedirect"),
+            ("UrlAccess", "ReLoginCount"),
+            ("UrlAccess", "ReLoginDelay"),
+            ("UrlAccess", "DetectHtmlLoginPattern"),
+            ("IndexerClient", "RetryTimeout"),
+            ("IndexerClient", "RetrySleep"),
+        ]
+
+        triple_transfer_fields = [
+            ("UrlAccess", "BrowserLogin", "Activate"),
+            ("UrlAccess", "BrowserLogin", "RemoteDebuggingPort"),
+            ("UrlAccess", "BrowserLogin", "BrowserLogLevel"),
+            ("UrlAccess", "BrowserLogin", "ShowDevTools"),
+            ("UrlAccess", "BrowserLogin", "SuccessCondition"),
+            ("UrlAccess", "BrowserLogin", "CookieFilter"),
         ]
 
         for field in transfer_fields:
@@ -187,18 +203,15 @@ def convert_template_to_plugin_indexer(self, scraper_editor) -> None:
                 f"{parent}/{child}", scraper_editor.get_tag_value(f"{parent}/{child}", strict=True)
             )
 
+        for grandparent, parent, child in triple_transfer_fields:
+            self.update_or_add_element_value(
+                f"{grandparent}/{parent}/{child}",
+                scraper_editor.get_tag_value(f"{grandparent}/{parent}/{child}", strict=True),
+            )
+
         scraper_config = self.update_config_xml()
         return scraper_config
 
-    def convert_template_to_indexer(self, collection) -> None:
-        """
-        assuming this class has been instantiated with the indexer_template.xml
-        """
-        self.update_or_add_element_value("Collection", f"/SDE/{collection.config_folder}/")
-        indexer_config = self.update_config_xml()
-
-        return indexer_config
-
     def _mapping_exists(self, new_mapping: ET.Element):
         """
         Check if the mapping with given parameters already exists in the XML tree
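
For orientation, a minimal usage sketch of the new convert_template_to_job method follows. The inline XML string stands in for job_template.xml, the SimpleNamespace object is a hypothetical stand-in for the Django Collection model (only config_folder is used here), and the job_source value is illustrative:

from types import SimpleNamespace

from config_generation.db_to_xml import XmlEditor

# Hypothetical stand-ins for job_template.xml and a Collection instance
job_template = "<root><Collection>placeholder</Collection></root>"
collection = SimpleNamespace(config_folder="example_collection")

editor = XmlEditor(job_template)
job_config = editor.convert_template_to_job(collection, job_source="scrapers")
# The <Collection> element now reads /scrapers/example_collection/, and
# job_config holds the serialized XML returned by update_config_xml()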
config_generation/tests/test_config_generation_pipeline.py

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+from unittest.mock import MagicMock, call, patch
+
+from django.test import TestCase
+
+from sde_collections.models.collection import Collection
+from sde_collections.models.collection_choice_fields import WorkflowStatusChoices
+
+"""
+Workflow status change → Opens template → Applies XML transformation → Writes to GitHub.
+
+- When the `workflow_status` changes, it triggers the relevant config creation method.
+- The method reads a template and processes it using `XmlEditor`.
+- `XmlEditor` modifies the template by injecting collection-specific values and transformations.
+- The generated XML is passed to `_write_to_github()`, which commits it directly to GitHub.
+
+Note: This test verifies that the correct methods are triggered and XML content is passed to GitHub.
+The actual XML structure and correctness are tested separately in `test_db_to_xml.py`.
+"""
+
+
+class TestConfigCreation(TestCase):
+    def setUp(self):
+        self.collection = Collection.objects.create(
+            name="Test Collection", division="1", workflow_status=WorkflowStatusChoices.RESEARCH_IN_PROGRESS
+        )
+
+    @patch("sde_collections.utils.github_helper.GitHubHandler")  # Mock GitHubHandler
+    @patch("sde_collections.models.collection.Collection._write_to_github")
+    @patch("sde_collections.models.collection.XmlEditor")
+    def test_ready_for_engineering_triggers_config_and_job_creation(
+        self, MockXmlEditor, mock_write_to_github, MockGitHubHandler
+    ):
+        """
+        When the collection's workflow status is updated to READY_FOR_ENGINEERING,
+        it should trigger the creation of scraper configuration and job files.
+        """
+        # Mock GitHubHandler to avoid actual API calls
+        mock_github_instance = MockGitHubHandler.return_value
+        mock_github_instance.create_file.return_value = None
+        mock_github_instance.create_or_update_file.return_value = None
+
+        # Set up the XmlEditor mock for both config and job
+        mock_editor_instance = MockXmlEditor.return_value
+        mock_editor_instance.convert_template_to_scraper.return_value = "<scraper_config>config_data</scraper_config>"
+        mock_editor_instance.convert_template_to_job.return_value = "<scraper_job>job_data</scraper_job>"
+
+        # Simulate the status change to READY_FOR_ENGINEERING
+        self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_ENGINEERING
+        self.collection.save()
+
+        # Verify that the XML for both config and job are generated and written to GitHub
+        expected_calls = [
+            call(self.collection._scraper_config_path, "<scraper_config>config_data</scraper_config>", False),
+            call(self.collection._scraper_job_path, "<scraper_job>job_data</scraper_job>", False),
+        ]
+        mock_write_to_github.assert_has_calls(expected_calls, any_order=True)
+
+    @patch("sde_collections.models.collection.GitHubHandler")  # Mock GitHubHandler in the correct module path
+    @patch("sde_collections.models.collection.Collection._write_to_github")
+    @patch("sde_collections.models.collection.XmlEditor")
+    def test_ready_for_curation_triggers_indexer_config_and_job_creation(
+        self, MockXmlEditor, mock_write_to_github, MockGitHubHandler
+    ):
+        """
+        When the collection's workflow status is updated to READY_FOR_CURATION,
+        it should trigger indexer config and job creation methods.
+        """
+        # Mock GitHubHandler to avoid actual API calls
+        mock_github_instance = MockGitHubHandler.return_value
+        mock_github_instance.check_file_exists.return_value = True  # Assume scraper exists
+        mock_github_instance._get_file_contents.return_value = MagicMock()
+        mock_github_instance._get_file_contents.return_value.decoded_content = (
+            b"<scraper_config>Mock Data</scraper_config>"
+        )
+
+        # Set up the XmlEditor mock for both config and job
+        mock_editor_instance = MockXmlEditor.return_value
+        mock_editor_instance.convert_template_to_indexer.return_value = "<indexer_config>config_data</indexer_config>"
+        mock_editor_instance.convert_template_to_job.return_value = "<indexer_job>job_data</indexer_job>"
+
+        # Simulate the status change to READY_FOR_CURATION
+        self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION
+        self.collection.save()
+
+        # Verify that the XML for both indexer config and job are generated and written to GitHub
+        expected_calls = [
+            call(self.collection._indexer_config_path, "<indexer_config>config_data</indexer_config>", True),
+            call(self.collection._indexer_job_path, "<indexer_job>job_data</indexer_job>", False),
+        ]
+        mock_write_to_github.assert_has_calls(expected_calls, any_order=True)
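
For readers tracing the flow, here is a rough reconstruction of the dispatch these tests exercise, pieced together from the mocked names and assertions above. It is a sketch, not the actual model code: the real logic lives in sde_collections/models/collection.py, read_template() is a hypothetical loader, the template file names follow the docstrings in db_to_xml.py where shown (job_template.xml, final_config_template.xml) and are otherwise guesses, and the job_source strings and boolean third argument to _write_to_github() simply mirror what the tests assert:

from config_generation.db_to_xml import XmlEditor
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices


def read_template(path):
    # Hypothetical helper; how templates are actually loaded is not shown in this diff
    with open(path) as f:
        return f.read()


def handle_status_change(collection, new_status):
    if new_status == WorkflowStatusChoices.READY_FOR_ENGINEERING:
        config = XmlEditor(read_template("scraper_template.xml")).convert_template_to_scraper(collection)
        job = XmlEditor(read_template("job_template.xml")).convert_template_to_job(collection, "scrapers")
        collection._write_to_github(collection._scraper_config_path, config, False)
        collection._write_to_github(collection._scraper_job_path, job, False)
    elif new_status == WorkflowStatusChoices.READY_FOR_CURATION:
        # The test seeds the existing scraper config from GitHub; inlined here for brevity
        scraper_editor = XmlEditor("<scraper_config>Mock Data</scraper_config>")
        config = XmlEditor(read_template("final_config_template.xml")).convert_template_to_indexer(scraper_editor)
        job = XmlEditor(read_template("job_template.xml")).convert_template_to_job(collection, "indexers")
        collection._write_to_github(collection._indexer_config_path, config, True)
        collection._write_to_github(collection._indexer_job_path, job, False)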
config_generation/tests/test_db_to_xml.py

Lines changed: 108 additions & 32 deletions

@@ -1,4 +1,7 @@
-import xml.etree.ElementTree as ET
+# docker-compose -f local.yml run --rm django pytest config_generation/tests/test_db_to_xml.py
+from xml.etree.ElementTree import ElementTree, ParseError, fromstring
+
+import pytest
 
 from ..db_to_xml import XmlEditor
 
@@ -28,39 +31,112 @@ def elements_equal(e1, e2):
             return False
         return all(elements_equal(c1, c2) for c1, c2 in zip(e1, e2))
 
-    tree1 = ET.fromstring(xml1)
-    tree2 = ET.fromstring(xml2)
-    return elements_equal(tree1, tree2)
+    tree1 = ElementTree(fromstring(xml1))
+    tree2 = ElementTree(fromstring(xml2))
 
+    return elements_equal(tree1.getroot(), tree2.getroot())
 
-def test_update_or_add_element_value():
-    xml_string = """<root>
-    <child>
-        <grandchild>old_value</grandchild>
-    </child>
-    </root>"""
 
+# Tests for valid and invalid XML initializations
+def test_valid_xml_initialization():
+    xml_string = "<root><child>Test</child></root>"
     editor = XmlEditor(xml_string)
+    assert editor.get_tag_value("child") == ["Test"]
 
-    # To update an existing element's value
-    updated_xml = editor.update_or_add_element_value("child/grandchild", "new_value")
-    expected_output = """<root>
-    <child>
-        <grandchild>new_value</grandchild>
-    </child>
-    </root>
-    """
-    assert xmls_equal(updated_xml, expected_output)
-
-    # To create a new element and set its value
-    new_xml = editor.update_or_add_element_value("newchild", "some_value")
-    expected_output = """<root>
-    <child>
-        <grandchild>new_value</grandchild>
-    </child>
-    <newchild>
-        some_value
-    </newchild>
-    </root>
-    """
-    assert xmls_equal(new_xml, expected_output)
+
+def test_invalid_xml_initialization():
+    with pytest.raises(ParseError):
+        XmlEditor("<root><child></root>")
+
+
+# Test retrieval of single and multiple tag values
+def test_get_single_tag_value():
+    xml_string = "<root><child>Test</child></root>"
+    editor = XmlEditor(xml_string)
+    assert editor.get_tag_value("child", strict=True) == "Test"
+
+
+def test_get_nonexistent_tag_value():
+    xml_string = "<root><child>Test</child></root>"
+    editor = XmlEditor(xml_string)
+    assert editor.get_tag_value("nonexistent", strict=False) == []
+
+
+def test_get_tag_value_strict_multiple_elements():
+    xml_string = "<root><child>One</child><child>Two</child></root>"
+    editor = XmlEditor(xml_string)
+    with pytest.raises(ValueError):
+        editor.get_tag_value("child", strict=True)
+
+
+# Test updating and adding XML elements
+def test_update_existing_element():
+    xml_string = "<root><child>Old</child></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("child", "New")
+    updated_xml = editor.update_config_xml()
+    assert "New" in updated_xml and "Old" not in updated_xml
+
+
+def test_add_new_element():
+    xml_string = "<root></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("newchild", "Value")
+    updated_xml = editor.update_config_xml()
+    assert "Value" in updated_xml and "<newchild>Value</newchild>" in updated_xml
+
+
+def test_add_third_level_hierarchy():
+    xml_string = "<root></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("parent/child/grandchild", "DeeplyNested")
+    updated_xml = editor.update_config_xml()
+    root = fromstring(updated_xml)
+    grandchild = root.find(".//grandchild")
+    assert grandchild is not None, "Grandchild element not found"
+    assert grandchild.text == "DeeplyNested", "Grandchild does not contain the correct text"
+
+    # Check complete path
+    parent = root.find(".//parent/child/grandchild")
+    assert parent is not None, "Complete path to grandchild not found"
+    assert parent.text == "DeeplyNested", "Complete path to grandchild does not contain correct text"
+
+
+# Test transformations and generic mapping
+def test_convert_indexer_to_scraper_transformation():
+    xml_string = """<root><Plugin>Indexer</Plugin></root>"""
+    editor = XmlEditor(xml_string)
+    editor.convert_indexer_to_scraper()
+    updated_xml = editor.update_config_xml()
+    assert "<Plugin>SMD_Plugins/Sinequa.Plugin.ListCandidateUrls</Plugin>" in updated_xml
+    assert "<Plugin>Indexer</Plugin>" not in updated_xml
+
+
+def test_generic_mapping_addition():
+    xml_string = "<root></root>"
+    editor = XmlEditor(xml_string)
+    editor._generic_mapping(name="id", value="doc.url1", selection="url1")
+    updated_xml = editor.update_config_xml()
+    assert "<Mapping>" in updated_xml
+    assert "<Name>id</Name>" in updated_xml
+    assert "<Value>doc.url1</Value>" in updated_xml
+
+
+# Test XML serialization with headers
+def test_xml_serialization_with_header():
+    xml_string = "<root><child>Value</child></root>"
+    editor = XmlEditor(xml_string)
+    xml_output = editor.update_config_xml()
+    assert '<?xml version="1.0" encoding="utf-8"?>' in xml_output
+    assert "<root>" in xml_output and "<child>Value</child>" in xml_output
+
+
+# Test handling multiple changes accumulation
+def test_multiple_changes_accumulation():
+    xml_string = "<root><child>Initial</child></root>"
+    editor = XmlEditor(xml_string)
+    editor.update_or_add_element_value("child", "Modified")
+    editor.update_or_add_element_value("newchild", "Added")
+    updated_xml = editor.update_config_xml()
+    assert "Modified" in updated_xml and "Added" in updated_xml
+    assert "Initial" not in updated_xml
