Skip to content

Commit ebf5755

Browse files
authored
fix: JSON handling with Parsel (#490)
- Closes #488. Thanks @Ehsan-U for reporting this!
1 parent 2be7454 commit ebf5755

File tree

2 files changed

+52
-1
lines changed

2 files changed

+52
-1
lines changed

src/crawlee/parsel_crawler/_parsel_crawler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,12 @@ async def _handle_blocked_request(
8989
if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
9090
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
9191

92+
parsel = crawling_context.selector
93+
9294
matched_selectors = [
9395
selector
9496
for selector in RETRY_CSS_SELECTORS
95-
if crawling_context.selector.css(selector).get() is not None
97+
if parsel.type in ('html', 'xml') and parsel.css(selector).get() is not None
9698
]
9799

98100
if matched_selectors:

tests/unit/parsel_crawler/test_parsel_crawler.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,21 @@ async def server() -> AsyncGenerator[respx.MockRouter, None]:
7171
</html>""",
7272
)
7373

74+
mock.get('/json', name='json_endpoint').return_value = Response(
75+
200,
76+
text="""{
77+
"hello": "world"
78+
}""",
79+
)
80+
81+
mock.get('/xml', name='xml_endpoint').return_value = Response(
82+
200,
83+
text="""
84+
<?xml version="1.0"?>
85+
<hello>world</hello>
86+
""",
87+
)
88+
7489
generic_response = Response(
7590
200,
7691
text="""<html>
@@ -225,3 +240,37 @@ def test_import_error_handled() -> None:
225240
"To import anything from this subpackage, you need to install the 'parsel' extra."
226241
"For example, if you use pip, run `pip install 'crawlee[parsel]'`."
227242
)
243+
244+
245+
async def test_json(server: respx.MockRouter) -> None:
246+
crawler = ParselCrawler(request_provider=RequestList(['https://test.io/json']))
247+
handler = mock.AsyncMock()
248+
249+
@crawler.router.default_handler
250+
async def request_handler(context: ParselCrawlingContext) -> None:
251+
result = context.selector.jmespath('hello').getall()
252+
await handler(result)
253+
254+
await crawler.run()
255+
256+
assert server['json_endpoint'].called
257+
assert handler.called
258+
259+
assert handler.call_args[0][0] == ['world']
260+
261+
262+
async def test_xml(server: respx.MockRouter) -> None:
263+
crawler = ParselCrawler(request_provider=RequestList(['https://test.io/xml']))
264+
handler = mock.AsyncMock()
265+
266+
@crawler.router.default_handler
267+
async def request_handler(context: ParselCrawlingContext) -> None:
268+
result = context.selector.css('hello').getall()
269+
await handler(result)
270+
271+
await crawler.run()
272+
273+
assert server['xml_endpoint'].called
274+
assert handler.called
275+
276+
assert handler.call_args[0][0] == ['<hello>world</hello>']

0 commit comments

Comments
 (0)