Skip to content

Commit 6dc0f45

Browse files
committed
Update tests now that jsonld_errors is no longer one of the always-included columns
1 parent c7da921 commit 6dc0f45

File tree

1 file changed

+38
-9
lines changed

1 file changed

+38
-9
lines changed

tests/test_spider.py

Lines changed: 38 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,6 @@ def test_numbered_duplicates_empty_list():
102102
result = _numbered_duplicates([])
103103
assert result == []
104104

105-
106105
def test_json_to_dict_empty_input():
107106
result = _json_to_dict({})
108107
assert result == {}
@@ -114,7 +113,6 @@ def sample_dict():
114113
return {
115114
"url": "https://example.com/",
116115
"errors": [],
117-
"jsonld_errors": [],
118116
"title": "Example Domain",
119117
"h1": "Example Domain",
120118
"h2": "Sub‑heading",
@@ -138,11 +136,11 @@ def test_filterdict_returns_original_when_no_filters(sample_dict):
138136
@pytest.mark.parametrize(
139137
"patterns, expected_keys",
140138
[
141-
(["^title$"], {"title", "url", "errors", "jsonld_errors"}),
142-
(["^h\\d$"], {"h1", "h2", "url", "errors", "jsonld_errors"}),
139+
(["^title$"], {"title", "url", "errors"}),
140+
(["^h\\d$"], {"h1", "h2", "url", "errors"}),
143141
(
144142
["(title|meta_description)"],
145-
{"title", "meta_description", "url", "errors", "jsonld_errors"},
143+
{"title", "meta_description", "url", "errors"},
146144
),
147145
],
148146
)
@@ -162,7 +160,7 @@ def test_filterdict_keep_only(sample_dict, patterns, expected_keys):
162160
def test_filterdict_discard_only(sample_dict, patterns, forbidden_keys):
163161
result = _filter_crawl_dict(sample_dict, discard_columns=patterns)
164162
assert not (set(result) & set(forbidden_keys))
165-
for k in {"url", "errors", "jsonld_errors"}:
163+
for k in {"url", "errors"}:
166164
assert k in result
167165

168166

@@ -172,14 +170,14 @@ def test_filterdict_discard_overrides_keep(sample_dict):
172170
result = _filter_crawl_dict(sample_dict, keep, discard)
173171
assert "h1" in result
174172
assert "h2" not in result
175-
for k in {"url", "errors", "jsonld_errors"}:
173+
for k in {"url", "errors"}:
176174
assert k in result
177175

178176

179-
@pytest.mark.parametrize("pattern", [r"url", r"errors", r"jsonld"])
177+
@pytest.mark.parametrize("pattern", [r"url", r"errors"])
180178
def test_filterdict_always_include_never_dropped(sample_dict, pattern):
181179
result = _filter_crawl_dict(sample_dict, discard_columns=[pattern])
182-
for k in {"url", "errors", "jsonld_errors"}:
180+
for k in {"url", "errors"}:
183181
assert k in result
184182

185183

@@ -190,3 +188,34 @@ def test_filterdict_empty_input_dict_returns_empty():
190188
def test_filterdict_invalid_regex_raises(sample_dict):
191189
with pytest.raises(re.error):
192190
_filter_crawl_dict(sample_dict, keep_columns=["["]) # invalid pattern
191+
192+
193+
def test_filterdict_jsonld_errors_can_be_filtered():
194+
"""Test that jsonld_errors can now be filtered out since it's not always included."""
195+
sample_with_jsonld = {
196+
"url": "https://example.com/",
197+
"errors": [],
198+
"title": "Example Domain",
199+
"jsonld_errors": ["Some JSON-LD error"],
200+
"custom_key": "value",
201+
}
202+
203+
result = _filter_crawl_dict(sample_with_jsonld, discard_columns=["jsonld_errors"])
204+
assert "jsonld_errors" not in result
205+
assert "url" in result # always included
206+
assert "errors" in result # always included
207+
208+
result2 = _filter_crawl_dict(sample_with_jsonld, keep_columns=["jsonld_errors"])
209+
assert set(result2.keys()) == {"url", "errors", "jsonld_errors"}
210+
211+
212+
def test_filterdict_jsonld_errors_not_always_present():
213+
"""Test that dictionaries without jsonld_errors work correctly."""
214+
sample_without_jsonld = {
215+
"url": "https://example.com/",
216+
"errors": [],
217+
"title": "Example Domain",
218+
}
219+
220+
result = _filter_crawl_dict(sample_without_jsonld, keep_columns=["title"])
221+
assert set(result.keys()) == {"url", "errors", "title"}

0 commit comments

Comments (0)