Skip to content

Commit 6dd546e

Browse files
committed
tests: add Scrapy integration test
1 parent 6364cad commit 6dd546e

File tree

1 file changed

+144
-0
lines changed

1 file changed

+144
-0
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
import pytest
6+
7+
if TYPE_CHECKING:
8+
from .conftest import MakeActorFunction, RunActorFunction
9+
10+
11+
async def test_actor_scrapy_title_spider(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Build and run an Actor that wraps a Scrapy spider and verify its dataset output.

    The Actor bundles a minimal Scrapy project (spider, item, settings, logging setup)
    as in-memory source files, deploys it via `make_actor`, runs it, and checks that
    exactly one title item was scraped from https://crawlee.dev/.
    """
    # NOTE: embedded file contents are written at column 0 inside the strings so they
    # are valid Python regardless of whether the build step dedents them.
    actor_source_files = {
        'requirements.txt': """
nest-asyncio ~= 1.6
scrapy ~= 2.12
""",
        'src/spiders/title.py': """
from __future__ import annotations

from typing import TYPE_CHECKING, Generator

from scrapy import Request, Spider

from ..items import TitleItem

if TYPE_CHECKING:
    from scrapy.responsetypes import Response


class TitleSpider(Spider):
    name = 'title_spider'
    start_urls = ['https://crawlee.dev/']

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        self.logger.info('TitleSpider is parsing %s...', response)
        url = response.url
        title = response.css('title::text').get().strip()
        yield TitleItem(url=url, title=title)
""",
        'src/spiders/__init__.py': """
from .title import TitleSpider
""",
        'src/items.py': """
import scrapy


class TitleItem(scrapy.Item):
    # Fix: both fields must be Field *instances*; `scrapy.Field` (no call) would
    # assign the class itself and break `TitleItem(url=..., title=...)`.
    url = scrapy.Field()
    title = scrapy.Field()
""",
        'src/settings.py': """
BOT_NAME = 'title_scraper'
LOG_LEVEL = 'INFO'
SPIDER_MODULES = ['src.spiders']
NEWSPIDER_MODULE = 'src.spiders'
ROBOTSTXT_OBEY = True
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
FEED_EXPORT_ENCODING = 'utf-8'
""",
        'src/__init__.py': '',
        'src/main.py': """
from __future__ import annotations

from scrapy.crawler import CrawlerProcess

from apify import Actor
from apify.scrapy.utils import apply_apify_settings

from .spiders.title import TitleSpider as Spider


async def main() -> None:
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Seed the Apify request queue; the Apify scheduler (installed by
        # apply_apify_settings) feeds it to the spider.
        request_queue = await Actor.open_request_queue()
        await request_queue.add_request('https://crawlee.dev/')

        settings = apply_apify_settings()
        process = CrawlerProcess(settings, install_root_handler=False)
        process.crawl(Spider)
        process.start()
""",
        'src/__main__.py': """
from __future__ import annotations

from logging import StreamHandler, getLogger
from typing import Any

from scrapy.utils import log as scrapy_logging
from scrapy.utils.project import get_project_settings

from apify.log import ActorLogFormatter

MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES

settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

apify_handler = StreamHandler()
apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))


def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
    logger = getLogger(logger_name)
    logger.setLevel(log_level)
    logger.handlers = []

    for handler in handlers:
        logger.addHandler(handler)


for logger_name in MAIN_LOGGER_NAMES:
    configure_logger(logger_name, LOGGING_LEVEL, apify_handler)

# Scrapy re-configures logging on startup; wrap its hook so our handlers
# are re-applied after it runs.
old_configure_logging = scrapy_logging.configure_logging


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    old_configure_logging(*args, **kwargs)
    configure_logger(None, LOGGING_LEVEL, apify_handler)
    for logger_name in ALL_LOGGER_NAMES:
        configure_logger(logger_name, LOGGING_LEVEL)
    configure_logger('httpx', 'WARNING')


scrapy_logging.configure_logging = new_configure_logging

import asyncio
import os

import nest_asyncio
from scrapy.utils.reactor import install_reactor

from .main import main

# The asyncio reactor must be installed before Scrapy starts; nest_asyncio
# lets Scrapy's nested event-loop usage coexist with asyncio.run().
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

asyncio.run(main())
""",
    }

    actor = await make_actor('actor-scrapy-title-spider', source_files=actor_source_files)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'

    items = await actor.last_run().dataset().list_items()

    # Fix: the original placeholder `items.items == {'blah'}` compared a list of
    # dicts to a set literal and could never pass. Assert on the single scraped item.
    assert items.count == 1
    assert items.items[0]['url'] == 'https://crawlee.dev/'
    assert 'Crawlee' in items.items[0]['title']

0 commit comments

Comments
 (0)