From 8d40e40da60a282c8ea8dd0dd71b09cc0a0dc9f7 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 6 Feb 2025 15:42:59 +0100 Subject: [PATCH 01/16] fix: fix Scrapy integration --- docs/02_guides/05_scrapy.mdx | 13 +- .../code/_scrapy_project/.actor/Dockerfile | 22 ++ .../code/_scrapy_project/.actor/actor.json | 13 ++ .../_scrapy_project/.actor/input_schema.json | 33 +++ .../code/_scrapy_project/.dockerignore | 155 +++++++++++++ .../02_guides/code/_scrapy_project/.gitignore | 154 +++++++++++++ docs/02_guides/code/_scrapy_project/README.md | 32 +++ .../code/_scrapy_project/requirements.txt | 5 + .../02_guides/code/_scrapy_project/scrapy.cfg | 5 + .../src}/__init__.py | 0 .../code/_scrapy_project/src/__main__.py | 31 +++ .../src}/items.py | 0 .../src}/main.py | 40 ++-- .../code/_scrapy_project/src/middlewares.py | 129 +++++++++++ .../code/_scrapy_project/src/pipelines.py | 21 ++ .../src}/py.typed | 0 .../code/_scrapy_project/src/settings.py | 26 +++ .../_scrapy_project/src/spiders/__init__.py | 10 + .../src}/spiders/py.typed | 0 .../src}/spiders/title.py | 35 ++- docs/02_guides/code/scrapy_src/__main__.py | 121 ---------- docs/02_guides/code/scrapy_src/settings.py | 15 -- .../code/scrapy_src/spiders/__init__.py | 0 src/apify/_actor.py | 12 +- src/apify/scrapy/__init__.py | 10 +- src/apify/scrapy/_actor_runner.py | 32 +++ src/apify/scrapy/_logging_config.py | 61 +++++ src/apify/scrapy/middlewares/apify_proxy.py | 4 +- src/apify/scrapy/scheduler.py | 209 +++++++++++++----- src/apify/scrapy/utils.py | 3 - tests/integration/test_actor_scrapy.py | 137 ++++++++++++ 31 files changed, 1085 insertions(+), 243 deletions(-) create mode 100644 docs/02_guides/code/_scrapy_project/.actor/Dockerfile create mode 100644 docs/02_guides/code/_scrapy_project/.actor/actor.json create mode 100644 docs/02_guides/code/_scrapy_project/.actor/input_schema.json create mode 100644 docs/02_guides/code/_scrapy_project/.dockerignore create mode 100644 docs/02_guides/code/_scrapy_project/.gitignore create mode 100644 docs/02_guides/code/_scrapy_project/README.md create mode 100644 docs/02_guides/code/_scrapy_project/requirements.txt create mode 100644 docs/02_guides/code/_scrapy_project/scrapy.cfg rename docs/02_guides/code/{scrapy_src => _scrapy_project/src}/__init__.py (100%) create mode 100644 docs/02_guides/code/_scrapy_project/src/__main__.py rename docs/02_guides/code/{scrapy_src => _scrapy_project/src}/items.py (100%) rename docs/02_guides/code/{scrapy_src => _scrapy_project/src}/main.py (62%) create mode 100644 docs/02_guides/code/_scrapy_project/src/middlewares.py create mode 100644 docs/02_guides/code/_scrapy_project/src/pipelines.py rename docs/02_guides/code/{scrapy_src => _scrapy_project/src}/py.typed (100%) create mode 100644 docs/02_guides/code/_scrapy_project/src/settings.py create mode 100644 docs/02_guides/code/_scrapy_project/src/spiders/__init__.py rename docs/02_guides/code/{scrapy_src => _scrapy_project/src}/spiders/py.typed (100%) rename docs/02_guides/code/{scrapy_src => _scrapy_project/src}/spiders/title.py (53%) delete mode 100644 docs/02_guides/code/scrapy_src/__main__.py delete mode 100644 docs/02_guides/code/scrapy_src/settings.py delete mode 100644 docs/02_guides/code/scrapy_src/spiders/__init__.py create mode 100644 src/apify/scrapy/_actor_runner.py create mode 100644 src/apify/scrapy/_logging_config.py create mode 100644 tests/integration/test_actor_scrapy.py diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 98526e65..2b89ac04 100644 --- 
a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -7,11 +7,11 @@ import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import UnderscoreMainExample from '!!raw-loader!./code/scrapy_src/__main__.py'; -import MainExample from '!!raw-loader!./code/scrapy_src/main.py'; -import ItemsExample from '!!raw-loader!./code/scrapy_src/items.py'; -import SettingsExample from '!!raw-loader!./code/scrapy_src/settings.py'; -import TitleSpiderExample from '!!raw-loader!./code/scrapy_src/spiders/title.py'; +import UnderscoreMainExample from '!!raw-loader!./code/_scrapy_project/src/__main__.py'; +import MainExample from '!!raw-loader!./code/_scrapy_project/src/main.py'; +import ItemsExample from '!!raw-loader!./code/_scrapy_project/src/items.py'; +import SettingsExample from '!!raw-loader!./code/_scrapy_project/src/settings.py'; +import TitleSpiderExample from '!!raw-loader!./code/_scrapy_project/src/spiders/title.py'; [Scrapy](https://scrapy.org/) is an open-source web scraping framework written in Python. It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites, handle pagination and navigation. @@ -92,5 +92,4 @@ Here is an example of a Scrapy Actor that scrapes the titles of web pages and en ## Conclusion -In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects -using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/code/_scrapy_project/.actor/Dockerfile b/docs/02_guides/code/_scrapy_project/.actor/Dockerfile new file mode 100644 index 00000000..f486156c --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/.actor/Dockerfile @@ -0,0 +1,22 @@ +FROM apify/actor-python:3.12 + +COPY pyproject.toml ./ + +RUN echo "Python version:" \ + && python --version \ + && echo "Pip version:" \ + && pip --version \ + && echo "Installing Poetry:" \ + && pip install --no-cache-dir poetry~=1.8.0 \ + && echo "Installing dependencies:" \ + && poetry config virtualenvs.create false \ + && poetry install --only main --no-interaction --no-ansi \ + && rm -rf /tmp/.poetry-cache \ + && echo "All installed Python packages:" \ + && pip freeze + +COPY . ./ + +RUN python3 -m compileall -q . 
+ +CMD ["python3", "-m", "src"] diff --git a/docs/02_guides/code/_scrapy_project/.actor/actor.json b/docs/02_guides/code/_scrapy_project/.actor/actor.json new file mode 100644 index 00000000..418b0ffe --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/.actor/actor.json @@ -0,0 +1,13 @@ +{ + "actorSpecification": 1, + "name": "getting-started-python-scrapy", + "title": "Getting started with Python and Scrapy", + "description": "Scrapes titles of websites using Scrapy.", + "version": "0.0", + "buildTag": "latest", + "meta": { + "templateId": "python-scrapy" + }, + "input": "./input_schema.json", + "dockerfile": "./Dockerfile" +} diff --git a/docs/02_guides/code/_scrapy_project/.actor/input_schema.json b/docs/02_guides/code/_scrapy_project/.actor/input_schema.json new file mode 100644 index 00000000..6714b865 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/.actor/input_schema.json @@ -0,0 +1,33 @@ +{ + "title": "Python Scrapy Scraper", + "type": "object", + "schemaVersion": 1, + "properties": { + "startUrls": { + "title": "Start URLs", + "type": "array", + "description": "URLs to start with", + "editor": "requestListSources", + "prefill": [{ "url": "https://crawlee.dev/" }], + "default": [{ "url": "https://crawlee.dev/" }] + }, + "allowedDomains": { + "title": "Allowed domains", + "type": "array", + "description": "Domains that the scraper is allowed to crawl.", + "editor": "json", + "prefill": ["crawlee.dev"], + "default": ["crawlee.dev"] + }, + "proxyConfiguration": { + "sectionCaption": "Proxy and HTTP configuration", + "title": "Proxy configuration", + "type": "object", + "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.", + "editor": "proxy", + "prefill": { "useApifyProxy": false }, + "default": { "useApifyProxy": false } + } + }, + "required": ["startUrls"] +} diff --git a/docs/02_guides/code/_scrapy_project/.dockerignore b/docs/02_guides/code/_scrapy_project/.dockerignore new file mode 100644 index 00000000..6eb49d35 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/.dockerignore @@ -0,0 +1,155 @@ +.git +.mise.toml +.nvim.lua +storage + +# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/docs/02_guides/code/_scrapy_project/.gitignore b/docs/02_guides/code/_scrapy_project/.gitignore new file mode 100644 index 00000000..f4ce363f --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/.gitignore @@ -0,0 +1,154 @@ +.mise.toml +.nvim.lua +storage + +# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/docs/02_guides/code/_scrapy_project/README.md b/docs/02_guides/code/_scrapy_project/README.md new file mode 100644 index 00000000..fb7b8d7f --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/README.md @@ -0,0 +1,32 @@ +## Python Scrapy template + +A template example built with Scrapy to scrape page titles from URLs defined in the input parameter. It shows how to use Apify SDK for Python and Scrapy pipelines to save results. + +## Included features + +- **[Apify SDK](https://docs.apify.com/sdk/python/)** for Python - a toolkit for building Apify [Actors](https://apify.com/actors) and scrapers in Python +- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input +- **[Request queue](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-request-queues)** - queues into which you can put the URLs you want to scrape +- **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes +- **[Scrapy](https://scrapy.org/)** - a fast high-level web scraping framework + +## How it works + +This code is a Python script that uses Scrapy to scrape web pages and extract data from them. 
Here's a brief overview of how it works: + +- The script reads the input data from the Actor instance, which is expected to contain a `start_urls` key with a list of URLs to scrape. +- The script then creates a Scrapy spider that will scrape the URLs. This spider (class `TitleSpider`) stores URLs and titles. +- A Scrapy pipeline is used to save the results to the default dataset associated with the Actor run using the `push_data` method of the Actor instance. +- The script catches any exceptions that occur during the [web scraping](https://apify.com/web-scraping) process and logs an error message using the `Actor.log.exception` method. + +## Resources + +- [Web scraping with Scrapy](https://blog.apify.com/web-scraping-with-scrapy/) +- [Python tutorials in Academy](https://docs.apify.com/academy/python) +- [Alternatives to Scrapy for web scraping in 2023](https://blog.apify.com/alternatives-scrapy-web-scraping/) +- [Beautiful Soup vs. Scrapy for web scraping](https://blog.apify.com/beautiful-soup-vs-scrapy-web-scraping/) +- [Integration with Zapier](https://apify.com/integrations), Make, Google Drive, and others +- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM) +- A short guide on how to build web scrapers using code templates: + +[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w) diff --git a/docs/02_guides/code/_scrapy_project/requirements.txt b/docs/02_guides/code/_scrapy_project/requirements.txt new file mode 100644 index 00000000..8d2de65b --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/requirements.txt @@ -0,0 +1,5 @@ +# Feel free to add your Python dependencies below. For formatting guidelines, see: +# https://pip.pypa.io/en/latest/reference/requirements-file-format/ + +apify[scrapy] < 3.0 +scrapy ~= 2.12 diff --git a/docs/02_guides/code/_scrapy_project/scrapy.cfg b/docs/02_guides/code/_scrapy_project/scrapy.cfg new file mode 100644 index 00000000..da962db6 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/scrapy.cfg @@ -0,0 +1,5 @@ +[settings] +default = src.settings + +[deploy] +project = src diff --git a/docs/02_guides/code/scrapy_src/__init__.py b/docs/02_guides/code/_scrapy_project/src/__init__.py similarity index 100% rename from docs/02_guides/code/scrapy_src/__init__.py rename to docs/02_guides/code/_scrapy_project/src/__init__.py diff --git a/docs/02_guides/code/_scrapy_project/src/__main__.py b/docs/02_guides/code/_scrapy_project/src/__main__.py new file mode 100644 index 00000000..aaba43c3 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/src/__main__.py @@ -0,0 +1,31 @@ +"""Apify Actor integration for Scrapy projects. + +This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's +logging system, and establishing the required environment to run the Scrapy spider within the Apify platform. + +This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally +or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using +`scrapy crawl title_spider`. + +We recommend you do not modify this file unless you really know what you are doing. +""" +# ruff: noqa: E402, I001 + +from __future__ import annotations +import asyncio +from twisted.internet import asyncioreactor + +# Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
+asyncioreactor.install(asyncio.get_event_loop()) # type: ignore[no-untyped-call] + +import os +from apify.scrapy import initialize_logging, run_scrapy_actor +from .main import main + +# Ensure the location to the Scrapy settings module is defined. +os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' + + +if __name__ == '__main__': + initialize_logging() + run_scrapy_actor(main()) diff --git a/docs/02_guides/code/scrapy_src/items.py b/docs/02_guides/code/_scrapy_project/src/items.py similarity index 100% rename from docs/02_guides/code/scrapy_src/items.py rename to docs/02_guides/code/_scrapy_project/src/items.py diff --git a/docs/02_guides/code/scrapy_src/main.py b/docs/02_guides/code/_scrapy_project/src/main.py similarity index 62% rename from docs/02_guides/code/scrapy_src/main.py rename to docs/02_guides/code/_scrapy_project/src/main.py index 1a878c5b..c593435e 100644 --- a/docs/02_guides/code/scrapy_src/main.py +++ b/docs/02_guides/code/_scrapy_project/src/main.py @@ -1,4 +1,4 @@ -"""This module defines the main entry point for the Apify Actor. +"""Main entry point for the Apify Actor & Scrapy integration. This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by @@ -18,43 +18,37 @@ For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy. """ +# ruff: noqa: I001 from __future__ import annotations -from scrapy.crawler import CrawlerProcess +from scrapy.crawler import CrawlerRunner +from scrapy.utils.defer import deferred_to_future -# Import your Scrapy spider here. -from .spiders.title import TitleSpider as Spider from apify import Actor from apify.scrapy.utils import apply_apify_settings -# Default input values for local execution using `apify run`. -LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}] +# Import your Scrapy spider here. +from .spiders.title import TitleSpider as Spider async def main() -> None: """Apify Actor main coroutine for executing the Scrapy spider.""" - # Enter the context of the Actor. async with Actor: - Actor.log.info('Actor is being executed...') - # Retrieve and process Actor input. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS) + start_urls = [url['url'] for url in actor_input.get('startUrls', [])] + allowed_domains = actor_input.get('allowedDomains') proxy_config = actor_input.get('proxyConfiguration') - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs. - for start_url in start_urls: - url = start_url.get('url') - await request_queue.add_request(url) - - # Apply Apify settings, it will override the Scrapy project settings. + # Apply Apify settings, which will override the Scrapy project settings. settings = apply_apify_settings(proxy_config=proxy_config) - # Execute the spider using Scrapy `CrawlerProcess`. - process = CrawlerProcess(settings, install_root_handler=False) - process.crawl(Spider) - process.start() + # Create CrawlerRunner and execute the Scrapy spider. 
+ crawler_runner = CrawlerRunner(settings) + crawl_deferred = crawler_runner.crawl( + Spider, + start_urls=start_urls, + allowed_domains=allowed_domains, + ) + await deferred_to_future(crawl_deferred) diff --git a/docs/02_guides/code/_scrapy_project/src/middlewares.py b/docs/02_guides/code/_scrapy_project/src/middlewares.py new file mode 100644 index 00000000..8fea4184 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/src/middlewares.py @@ -0,0 +1,129 @@ +"""Scrapy middlewares module. + +This module defines Scrapy middlewares. Middlewares are processing components that handle requests and +responses, typically used for adding custom headers, retrying requests, and handling exceptions. + +There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information +on creating and utilizing them, refer to the official documentation: +https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" +# ruff: noqa: ARG002, UP028 + +from __future__ import annotations + +from typing import TYPE_CHECKING + +# Useful for handling different item types with a single interface +from scrapy import Request, Spider, signals + +if TYPE_CHECKING: + from collections.abc import Generator, Iterable + + from scrapy.crawler import Crawler + from scrapy.http import Response + + +class TitleSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware: + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response: Response, spider: Spider) -> None: + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output( + self, + response: Response, + result: Iterable, + spider: Spider, + ) -> Generator[Iterable[Request] | None, None, None]: + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception( + self, + response: Response, + exception: BaseException, + spider: Spider, + ) -> Iterable[Request] | None: + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests( + self, start_requests: Iterable[Request], spider: Spider + ) -> Iterable[Request]: # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn't have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider: Spider) -> None: + pass + + +class TitleDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware: + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request: Request, spider: Spider) -> Request | Response | None: + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response: + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None: + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider: Spider) -> None: + pass diff --git a/docs/02_guides/code/_scrapy_project/src/pipelines.py b/docs/02_guides/code/_scrapy_project/src/pipelines.py new file mode 100644 index 00000000..7a1c9e8b --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/src/pipelines.py @@ -0,0 +1,21 @@ +"""Scrapy item pipelines module. + +This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components +that handle the scraped items, typically used for cleaning, validating, and persisting data. + +For detailed information on creating and utilizing item pipelines, refer to the official documentation: +http://doc.scrapy.org/en/latest/topics/item-pipeline.html +""" +# ruff: noqa: ARG002 + +from scrapy import Spider + +from .items import TitleItem + + +class TitleItemPipeline: + """This item pipeline defines processing steps for TitleItem objects scraped by spiders.""" + + def process_item(self, item: TitleItem, spider: Spider) -> TitleItem: + # Do something with the item here, such as cleaning it or persisting it to a database + return item diff --git a/docs/02_guides/code/scrapy_src/py.typed b/docs/02_guides/code/_scrapy_project/src/py.typed similarity index 100% rename from docs/02_guides/code/scrapy_src/py.typed rename to docs/02_guides/code/_scrapy_project/src/py.typed diff --git a/docs/02_guides/code/_scrapy_project/src/settings.py b/docs/02_guides/code/_scrapy_project/src/settings.py new file mode 100644 index 00000000..f3f0b696 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/src/settings.py @@ -0,0 +1,26 @@ +"""Scrapy settings module. + +This module contains Scrapy settings for the project, defining various configurations and options. 
+ +For more comprehensive details on Scrapy settings, refer to the official documentation: +http://doc.scrapy.org/en/latest/topics/settings.html +""" + +BOT_NAME = 'titlebot' +DEPTH_LIMIT = 1 +LOG_LEVEL = 'INFO' +NEWSPIDER_MODULE = 'src.spiders' +ROBOTSTXT_OBEY = True +SPIDER_MODULES = ['src.spiders'] +TELNETCONSOLE_ENABLED = False +TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' + +ITEM_PIPELINES = { + 'src.pipelines.TitleItemPipeline': 123, +} +SPIDER_MIDDLEWARES = { + 'src.middlewares.TitleSpiderMiddleware': 543, +} +DOWNLOADER_MIDDLEWARES = { + 'src.middlewares.TitleDownloaderMiddleware': 543, +} diff --git a/docs/02_guides/code/_scrapy_project/src/spiders/__init__.py b/docs/02_guides/code/_scrapy_project/src/spiders/__init__.py new file mode 100644 index 00000000..3a286fc6 --- /dev/null +++ b/docs/02_guides/code/_scrapy_project/src/spiders/__init__.py @@ -0,0 +1,10 @@ +"""Scrapy spiders package. + +This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape +and process data from websites. + +For detailed information on creating and utilizing spiders, refer to the official documentation: +https://docs.scrapy.org/en/latest/topics/spiders.html +""" + +from .title import TitleSpider diff --git a/docs/02_guides/code/scrapy_src/spiders/py.typed b/docs/02_guides/code/_scrapy_project/src/spiders/py.typed similarity index 100% rename from docs/02_guides/code/scrapy_src/spiders/py.typed rename to docs/02_guides/code/_scrapy_project/src/spiders/py.typed diff --git a/docs/02_guides/code/scrapy_src/spiders/title.py b/docs/02_guides/code/_scrapy_project/src/spiders/title.py similarity index 53% rename from docs/02_guides/code/scrapy_src/spiders/title.py rename to docs/02_guides/code/_scrapy_project/src/spiders/title.py index 7be37b68..b92ccb86 100644 --- a/docs/02_guides/code/scrapy_src/spiders/title.py +++ b/docs/02_guides/code/_scrapy_project/src/spiders/title.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from urllib.parse import urljoin from scrapy import Request, Spider @@ -16,19 +16,32 @@ class TitleSpider(Spider): - """Scrapes title pages and enqueues all links found on the page.""" + """A spider that scrapes web pages to extract titles and discover new links. - name = 'title_spider' + This spider retrieves the content of the <title> element from each page and queues + any valid hyperlinks for further crawling. + """ - # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input - # when the project is executed using Apify. - start_urls = ['https://apify.com/'] + name = 'title_spider' - # Scrape only the pages within the Apify domain. - allowed_domains = ['apify.com'] + def __init__( + self, + start_urls: list[str], + allowed_domains: list[str], + *args: Any, + **kwargs: Any, + ) -> None: + """A default constructor. - # Limit the number of pages to scrape. - custom_settings = {'CLOSESPIDER_PAGECOUNT': 10} + Args: + start_urls: URLs to start the scraping from. + allowed_domains: Domains that the scraper is allowed to crawl. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + """ + super().__init__(*args, **kwargs) + self.start_urls = start_urls + self.allowed_domains = allowed_domains def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: """Parse the web page response.
@@ -46,7 +59,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None title = response.css('title::text').extract_first() yield TitleItem(url=url, title=title) - # Extract all links from the page, create Requests out of them, and yield them + # Extract all links from the page, create `Request` objects out of them, and yield them. for link_href in response.css('a::attr("href")'): link_url = urljoin(response.url, link_href.get()) if link_url.startswith(('http://', 'https://')): diff --git a/docs/02_guides/code/scrapy_src/__main__.py b/docs/02_guides/code/scrapy_src/__main__.py deleted file mode 100644 index 56d477dd..00000000 --- a/docs/02_guides/code/scrapy_src/__main__.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Apify Actor integration for Scrapy projects. - -This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's -logging system, and establishing the required environment to run the Scrapy spider within the Apify platform. - -This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally -or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using -`scrapy crawl title_spider`. - -We recommend you do not modify this file unless you really know what you are doing. -""" - -# ruff: noqa: E402 - -# We need to configure the logging first before we import anything else, so that nothing else imports -# `scrapy.utils.log` before we patch it. -from __future__ import annotations - -from logging import StreamHandler, getLogger -from typing import Any - -from scrapy.utils import log as scrapy_logging -from scrapy.utils.project import get_project_settings - -from apify.log import ActorLogFormatter - -# Define names of the loggers. -MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy'] -OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted'] -ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES - -# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file, -# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for -# a specific logger, do it in this file. -settings = get_project_settings() -LOGGING_LEVEL = settings['LOG_LEVEL'] - -# Define a logging handler which will be used for the loggers. -apify_handler = StreamHandler() -apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True)) - - -def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None: - """Configure a logger with the specified settings. - - Args: - logger_name: The name of the logger to be configured. - log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...). - handlers: Optional list of logging handlers. - """ - logger = getLogger(logger_name) - logger.setLevel(log_level) - logger.handlers = [] - - for handler in handlers: - logger.addHandler(handler) - - -# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from -# the `main.py` and Scrapy components. 
-for logger_name in MAIN_LOGGER_NAMES: - configure_logger(logger_name, LOGGING_LEVEL, apify_handler) - -# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging` -# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though -# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method -# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because -# otherwise we would lose some log messages. -old_configure_logging = scrapy_logging.configure_logging - - -def new_configure_logging(*args: Any, **kwargs: Any) -> None: - """Configure logging for Scrapy and root loggers to ensure consistent logging behavior. - - We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root - logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary - loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here - these four loggers and the root logger. - """ - old_configure_logging(*args, **kwargs) - - # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger` - # property within spiders. See details in the Spider logger property: - # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46. - configure_logger(None, LOGGING_LEVEL, apify_handler) - - # We modify other loggers only by setting up their log level. A custom log handler is added - # only to the root logger to avoid duplicate log messages. - for logger_name in ALL_LOGGER_NAMES: - configure_logger(logger_name, LOGGING_LEVEL) - - # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless - # messages, especially when running on the platform. - configure_logger('httpx', 'WARNING') - - -scrapy_logging.configure_logging = new_configure_logging - -# Now we can do the rest of the setup. -import asyncio -import os - -import nest_asyncio -from scrapy.utils.reactor import install_reactor - -from .main import main - -# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is -# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries -# to work together. -# -# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly -# on Windows. -install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') -nest_asyncio.apply() - -# Specify the path to the Scrapy project settings module. -os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' - -# Run the Apify main coroutine in the event loop. -asyncio.run(main()) diff --git a/docs/02_guides/code/scrapy_src/settings.py b/docs/02_guides/code/scrapy_src/settings.py deleted file mode 100644 index 8a0fd3e6..00000000 --- a/docs/02_guides/code/scrapy_src/settings.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Scrapy settings module. - -This module contains Scrapy settings for the project, defining various configurations and options. 
- -For more comprehensive details on Scrapy settings, refer to the official documentation: -http://doc.scrapy.org/en/latest/topics/settings.html -""" - -BOT_NAME = 'titlebot' -DEPTH_LIMIT = 1 -LOG_LEVEL = 'INFO' -NEWSPIDER_MODULE = 'spiders' -REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7' -ROBOTSTXT_OBEY = True -SPIDER_MODULES = ['spiders'] diff --git a/docs/02_guides/code/scrapy_src/spiders/__init__.py b/docs/02_guides/code/scrapy_src/spiders/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 71e3b6e2..acb03885 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -35,6 +35,13 @@ from apify.log import _configure_logging, logger from apify.storages import Dataset, KeyValueStore, RequestQueue +try: + import scrapy # noqa: F401 + + scrapy_installed = True +except ImportError: + scrapy_installed = False + if TYPE_CHECKING: import logging from types import TracebackType @@ -270,8 +277,9 @@ async def finalize() -> None: self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython') elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test') - elif hasattr(asyncio, '_nest_patched'): - self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in a nested event loop') + elif scrapy_installed: + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running with Scrapy') + # Otherwise, it will just freeze. else: sys.exit(exit_code) diff --git a/src/apify/scrapy/__init__.py b/src/apify/scrapy/__init__.py index 3a665c99..55b88076 100644 --- a/src/apify/scrapy/__init__.py +++ b/src/apify/scrapy/__init__.py @@ -1,11 +1,15 @@ -from apify.scrapy.requests import to_apify_request, to_scrapy_request -from apify.scrapy.scheduler import ApifyScheduler -from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id +from ._actor_runner import run_scrapy_actor +from ._logging_config import initialize_logging +from .requests import to_apify_request, to_scrapy_request +from .scheduler import ApifyScheduler +from .utils import get_basic_auth_header, get_running_event_loop_id __all__ = [ 'ApifyScheduler', 'get_basic_auth_header', 'get_running_event_loop_id', + 'initialize_logging', + 'run_scrapy_actor', 'to_apify_request', 'to_scrapy_request', ] diff --git a/src/apify/scrapy/_actor_runner.py b/src/apify/scrapy/_actor_runner.py new file mode 100644 index 00000000..696b7a7f --- /dev/null +++ b/src/apify/scrapy/_actor_runner.py @@ -0,0 +1,32 @@ +"""Runner for Apify Actors using Twisted's reactor. + +This module provides functions to run your Actor, with a Scrapy project inside, within the Twisted +reactor by bridging asyncio coroutines with Twisted Deferreds. +""" + +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING + +from twisted.internet.defer import Deferred, ensureDeferred +from twisted.internet.task import react + +if TYPE_CHECKING: + from collections.abc import Coroutine + + +async def _run_coro_as_deferred(coro: Coroutine) -> None: + """Wrap the given asyncio coroutine in a Task and await its result as a Twisted Deferred.""" + task = asyncio.ensure_future(coro) + await Deferred.fromFuture(task) + + +def run_scrapy_actor(coro: Coroutine) -> None: + """Start Twisted's reactor and execute the provided Actor coroutine.
+ + This function initiates the Twisted reactor and runs the given asyncio coroutine (typically the + Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted event loops, + enabling the Apify and Scrapy integration to work together. + """ + react(lambda _: ensureDeferred(_run_coro_as_deferred(coro))) diff --git a/src/apify/scrapy/_logging_config.py b/src/apify/scrapy/_logging_config.py new file mode 100644 index 00000000..13ea8299 --- /dev/null +++ b/src/apify/scrapy/_logging_config.py @@ -0,0 +1,61 @@ +"""Logging configuration for Apify Actor & Scrapy integration. + +This module configures a custom logging system for Apify Actors and monkey-patches Scrapy's logging +to use an Apify log formatter and settings. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from scrapy.utils import log as scrapy_logging +from scrapy.utils.project import get_project_settings + +from apify.log import ActorLogFormatter + +# Define logger names. +_PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy'] +_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted'] +_ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS + + +def _configure_logger(name: str | None, logging_level: str, handler: logging.Handler) -> None: + """Clear and reconfigure the logger.""" + logger = logging.getLogger(name) + logger.handlers.clear() + logger.setLevel(logging_level) + + if name is None: # Root logger. + logger.addHandler(handler) + logger.propagate = False + else: + logger.propagate = True + + +def initialize_logging() -> None: + """Configure logging for Apify Actors and adjust Scrapy's logging settings.""" + # Retrieve Scrapy project settings and determine the logging level. + settings = get_project_settings() + logging_level = settings.get('LOG_LEVEL', 'INFO') # Default to INFO. + + # Create a custom handler with the Apify log formatter. + handler = logging.StreamHandler() + handler.setFormatter(ActorLogFormatter(include_logger_name=True)) + + # Configure the root logger and all other defined loggers. + for logger_name in [None, *_ALL_LOGGERS]: + _configure_logger(logger_name, logging_level, handler) + + # Set the 'httpx' logger to a less verbose level. + logging.getLogger('httpx').setLevel('WARNING') + + # Monkey-patch Scrapy's logging configuration to re-apply our settings. + original_configure_logging = scrapy_logging.configure_logging + + def new_configure_logging(*args: Any, **kwargs: Any) -> None: + original_configure_logging(*args, **kwargs) + for logger_name in [None, *_ALL_LOGGERS]: + _configure_logger(logger_name, logging_level, handler) + + scrapy_logging.configure_logging = new_configure_logging diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index f81be3c4..7d7eaaec 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -51,7 +51,7 @@ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> Apify proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS') if proxy_settings is None: - Actor.log.warning( + Actor.log.info( 'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing ' ' in the Actor input.'
) @@ -60,7 +60,7 @@ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> Apify use_apify_proxy = proxy_settings.get('useApifyProxy', False) if use_apify_proxy is not True: - Actor.log.warning( + Actor.log.info( 'ApifyHttpProxyMiddleware is not going to be used. Actor input field ' '"proxyConfiguration.useApifyProxy" is set to False.' ) diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 7d93388f..92676944 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -1,41 +1,110 @@ from __future__ import annotations +import asyncio +import threading import traceback -from typing import TYPE_CHECKING +from concurrent import futures +from logging import getLogger +from typing import TYPE_CHECKING, Any -from crawlee.storage_clients import MemoryStorageClient +from scrapy import Spider +from scrapy.core.scheduler import BaseScheduler +from scrapy.utils.reactor import is_asyncio_reactor_installed -from apify._configuration import Configuration +from crawlee._utils.crypto import crypto_random_object_id + +from apify import Actor, Configuration from apify.apify_storage_client import ApifyStorageClient +from apify.scrapy.requests import to_apify_request, to_scrapy_request +from apify.storages import RequestQueue -try: - from scrapy import Spider - from scrapy.core.scheduler import BaseScheduler - from scrapy.utils.reactor import is_asyncio_reactor_installed +if TYPE_CHECKING: + from collections.abc import Coroutine - if TYPE_CHECKING: - from scrapy.http.request import Request -except ImportError as exc: - raise ImportError( - 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', - ) from exc + from scrapy.http.request import Request + from twisted.internet.defer import Deferred -from crawlee._utils.crypto import crypto_random_object_id +logger = getLogger(__name__) -from apify import Actor -from apify.scrapy.requests import to_apify_request, to_scrapy_request -from apify.scrapy.utils import nested_event_loop -from apify.storages import RequestQueue +_TIMEOUT = 60 +"""The timeout for waiting on asyncio coroutines to finish.""" + + +def _start_event_loop(eventloop: asyncio.AbstractEventLoop) -> None: + """Set and run the event loop until it is stopped. + + Args: + eventloop: The asyncio event loop to run. + """ + asyncio.set_event_loop(eventloop) + try: + eventloop.run_forever() + finally: + eventloop.close() + logger.debug('Asyncio event loop has been closed.') + + +def _run_async_coro(eventloop: asyncio.AbstractEventLoop, coro: Coroutine) -> Any: + """Run a coroutine on the given loop in our separate thread, waiting for its result. + + Args: + eventloop: The asyncio event loop to run the coroutine on. + coro: The coroutine to run. + + Returns: + The result of the coroutine. + """ + if not eventloop.is_running(): + logger.warning('Event loop is not running! Ignoring coroutine execution.') + return None + + future = asyncio.run_coroutine_threadsafe(coro, eventloop) + try: + return future.result(timeout=_TIMEOUT) + except futures.TimeoutError as exc: + logger.exception('Coroutine execution timed out.', exc_info=exc) + raise + except Exception as exc: + logger.exception('Coroutine execution raised an exception.', exc_info=exc) + raise + + +async def _shutdown_async_tasks(eventloop: asyncio.AbstractEventLoop) -> None: + """Cancel and wait for all pending tasks on the current event loop. + + Args: + eventloop: The asyncio event loop to cancel tasks on. 
+ """ + tasks = [task for task in asyncio.all_tasks(eventloop) if task is not asyncio.current_task()] + if not tasks: + return + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + + +def _force_exit_event_loop(eventloop: asyncio.AbstractEventLoop, thread: threading.Thread) -> None: + """Forcefully shut down the event loop and its thread. + + Args: + eventloop: The asyncio event loop to stop. + thread: The thread running the event loop. + """ + try: + logger.info('Forced shutdown of the event loop and its thread...') + eventloop.call_soon_threadsafe(eventloop.stop) + thread.join(timeout=5) + except Exception as exc: + logger.exception('Exception occurred during forced event loop shutdown.', exc_info=exc) class ApifyScheduler(BaseScheduler): - """A Scrapy scheduler that uses the Apify Request Queue to manage requests. + """A Scrapy scheduler that uses the Apify request queue to manage requests. This scheduler requires the asyncio Twisted reactor to be installed. """ def __init__(self) -> None: - """Create a new instance.""" if not is_asyncio_reactor_installed(): raise ValueError( f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. ' @@ -45,7 +114,12 @@ def __init__(self) -> None: self._rq: RequestQueue | None = None self.spider: Spider | None = None - def open(self, spider: Spider) -> None: # this has to be named "open" + # Create a new event loop and run it in a separate thread. + self._eventloop = asyncio.new_event_loop() + self._thread = threading.Thread(target=lambda: _start_event_loop(self._eventloop), daemon=True) + self._thread.start() + + def open(self, spider: Spider) -> Deferred[None] | None: """Open the scheduler. Args: @@ -53,23 +127,55 @@ def open(self, spider: Spider) -> None: # this has to be named "open" """ self.spider = spider - async def open_queue() -> RequestQueue: + async def open_rq() -> RequestQueue: config = Configuration.get_global_configuration() - - # Use the ApifyStorageClient if the Actor is running on the Apify platform, - # otherwise use the MemoryStorageClient. - storage_client = ( - ApifyStorageClient.from_config(config) if config.is_at_home else MemoryStorageClient.from_config(config) - ) - - return await RequestQueue.open(storage_client=storage_client) + if config.is_at_home: + storage_client = ApifyStorageClient.from_config(config) + return await RequestQueue.open(storage_client=storage_client) + return await RequestQueue.open() try: - self._rq = nested_event_loop.run_until_complete(open_queue()) - except BaseException: + self._rq = _run_async_coro(self._eventloop, open_rq()) + except Exception: traceback.print_exc() raise + return None + + def close(self, reason: str) -> None: + """Close the scheduler. + + Shut down the event loop and its thread gracefully. + + Args: + reason: The reason for closing the spider. + """ + logger.debug(f'Closing {self.__class__.__name__} due to {reason}...') + try: + if self._eventloop.is_running(): + # Cancel all pending tasks in the event loop. + _run_async_coro(self._eventloop, _shutdown_async_tasks(self._eventloop)) + + # Stop the event loop. + self._eventloop.call_soon_threadsafe(self._eventloop.stop) + + # Wait for the event loop thread to exit. + self._thread.join(timeout=_TIMEOUT) + + # If the thread is still alive, execute a forced shutdown. + if self._thread.is_alive(): + logger.warning('Event loop thread did not exit cleanly! 
Forcing shutdown...') + _force_exit_event_loop(self._eventloop, self._thread) + + except KeyboardInterrupt: + logger.warning('Shutdown interrupted by KeyboardInterrupt!') + + except Exception: + logger.exception('Exception occurred while shutting down.') + + finally: + logger.debug(f'{self.__class__.__name__} closed successfully.') + def has_pending_requests(self) -> bool: """Check if the scheduler has any pending requests. @@ -80,8 +186,8 @@ def has_pending_requests(self) -> bool: raise TypeError('self._rq must be an instance of the RequestQueue class') try: - is_finished = nested_event_loop.run_until_complete(self._rq.is_finished()) - except BaseException: + is_finished = _run_async_coro(self._eventloop, self._rq.is_finished()) + except Exception: traceback.print_exc() raise @@ -106,21 +212,20 @@ def enqueue_request(self, request: Request) -> bool: apify_request = to_apify_request(request, spider=self.spider) if apify_request is None: - Actor.log.error(f'Request {request} was not enqueued because it could not be converted to Apify request.') + Actor.log.error(f'Request {request} could not be converted to Apify request.') return False - Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})') - + Actor.log.debug(f'[{call_id}]: Converted to apify_request: {apify_request}') if not isinstance(self._rq, RequestQueue): raise TypeError('self._rq must be an instance of the RequestQueue class') try: - result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request)) - except BaseException: + result = _run_async_coro(self._eventloop, self._rq.add_request(apify_request)) + except Exception: traceback.print_exc() raise - Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...') + Actor.log.debug(f'[{call_id}]: rq.add_request result: {result}') return bool(result.was_already_present) def next_request(self) -> Request | None: @@ -130,39 +235,31 @@ def next_request(self) -> Request | None: The next request, or None if there are no more requests. """ call_id = crypto_random_object_id(8) - Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...') - + Actor.log.debug(f'[{call_id}]: next_request called...') if not isinstance(self._rq, RequestQueue): raise TypeError('self._rq must be an instance of the RequestQueue class') - # Fetch the next request from the Request Queue try: - apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request()) - except BaseException: + apify_request = _run_async_coro(self._eventloop, self._rq.fetch_next_request()) + except Exception: traceback.print_exc() raise - Actor.log.debug( - f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})' - ) - + Actor.log.debug(f'[{call_id}]: Fetched apify_request: {apify_request}') if apify_request is None: return None if not isinstance(self.spider, Spider): raise TypeError('self.spider must be an instance of the Spider class') - # Let the Request Queue know that the request is being handled. Every request should be marked as handled, + # Let the request queue know that the request is being handled. Every request should be marked as handled, # retrying is handled by the Scrapy's RetryMiddleware. 
try: - nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request)) - except BaseException: + _run_async_coro(self._eventloop, self._rq.mark_request_as_handled(apify_request)) + except Exception: traceback.print_exc() raise scrapy_request = to_scrapy_request(apify_request, spider=self.spider) - Actor.log.debug( - f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned ' - f'(scrapy_request={scrapy_request})', - ) + Actor.log.debug(f'[{call_id}]: Converted to scrapy_request: {scrapy_request}') return scrapy_request diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 1f92d4ff..d248a639 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -20,9 +20,6 @@ ) from exc -nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop() - - @ignore_docs def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes: """Generate a basic authentication header for the given username and password.""" diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py new file mode 100644 index 00000000..283b1935 --- /dev/null +++ b/tests/integration/test_actor_scrapy.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from .conftest import MakeActorFunction, RunActorFunction + + +@pytest.mark.only +async def test_actor_scrapy_title_spider( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, +) -> None: + actor_source_files = { + 'requirements.txt': """ + scrapy ~= 2.12 + """, + 'src/spiders/title.py': """ + from __future__ import annotations + from typing import TYPE_CHECKING, Any + from urllib.parse import urljoin + from scrapy import Request, Spider + from ..items import TitleItem + + if TYPE_CHECKING: + from collections.abc import Generator + from scrapy.responsetypes import Response + + + class TitleSpider(Spider): + name = 'title_spider' + + def __init__( + self, + start_urls: list[str], + allowed_domains: list[str], + *args: Any, + **kwargs: Any, + ) -> None: + super().__init__(*args, **kwargs) + self.start_urls = start_urls + self.allowed_domains = allowed_domains + + def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: + self.logger.info('TitleSpider is parsing %s...', response) + url = response.url + title = response.css('title::text').extract_first() + yield TitleItem(url=url, title=title) + + for link_href in response.css('a::attr("href")'): + link_url = urljoin(response.url, link_href.get()) + if link_url.startswith(('http://', 'https://')): + yield Request(link_url) + """, + 'src/spiders/__init__.py': """ + from .title import TitleSpider + """, + 'src/items.py': """ + import scrapy + + class TitleItem(scrapy.Item): + url = scrapy.Field + title = scrapy.Field() + """, + 'src/settings.py': """ + BOT_NAME = 'titlebot' + DEPTH_LIMIT = 1 + LOG_LEVEL = 'INFO' + NEWSPIDER_MODULE = 'src.spiders' + ROBOTSTXT_OBEY = True + SPIDER_MODULES = ['src.spiders'] + TELNETCONSOLE_ENABLED = False + TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' + """, + 'src/__init__.py': '', + 'src/main.py': """ + from __future__ import annotations + from scrapy.crawler import CrawlerRunner + from scrapy.utils.defer import deferred_to_future + from apify import Actor + from apify.scrapy.utils import apply_apify_settings + from .spiders.title import TitleSpider as Spider + + + async def main() -> None: + async 
with Actor: + Actor.log.info('Actor is being executed...') + + # Retrieve and process Actor input. + start_urls = ['https://crawlee.dev'] + allowed_domains = ['crawlee.dev'] + proxy_config = {'useApifyProxy': True} + + # Apply Apify settings, which will override the Scrapy project settings. + settings = apply_apify_settings(proxy_config=proxy_config) + + # Create CrawlerRunner and execute the Scrapy spider. + crawler_runner = CrawlerRunner(settings) + crawl_deferred = crawler_runner.crawl( + Spider, + start_urls=start_urls, + allowed_domains=allowed_domains, + ) + await deferred_to_future(crawl_deferred) + """, + 'src/__main__.py': """ + from __future__ import annotations + import asyncio + from twisted.internet import asyncioreactor + + # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. + asyncioreactor.install(asyncio.get_event_loop()) + + import os + from apify.scrapy import initialize_logging, run_scrapy_actor + from .main import main + + # Ensure the location to the Scrapy settings module is defined. + os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' + + if __name__ == '__main__': + initialize_logging() + run_scrapy_actor(main()) + """, + } + + actor = await make_actor('actor-scrapy-title-spider', source_files=actor_source_files) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' + + items = await actor.last_run().dataset().list_items() + + assert items.count == 48 + assert items.items == {'blah'} From 8cc38b465dc84e035f9bed2396998ec8e1263b15 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Thu, 6 Feb 2025 16:15:46 +0100 Subject: [PATCH 02/16] comment out docs check CI step --- .github/workflows/run_code_checks.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run_code_checks.yaml b/.github/workflows/run_code_checks.yaml index dd9b4d23..058739da 100644 --- a/.github/workflows/run_code_checks.yaml +++ b/.github/workflows/run_code_checks.yaml @@ -19,9 +19,9 @@ jobs: name: Unit tests uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main - docs_check: - name: Docs check - uses: apify/workflows/.github/workflows/python_docs_check.yaml@main + # docs_check: + # name: Docs check + # uses: apify/workflows/.github/workflows/python_docs_check.yaml@main integration_tests: name: Integration tests From c1eacba1e9239ad623f732d671227d887a5d72a1 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Thu, 6 Feb 2025 19:27:25 +0100 Subject: [PATCH 03/16] Scrapy integration test is working --- .../code/_scrapy_project/src/__main__.py | 3 +-- .../code/_scrapy_project/src/spiders/title.py | 3 +++ tests/integration/README.md | 1 - .../actor_source_base/requirements.txt | 1 + tests/integration/test_actor_scrapy.py | 21 +++++++++---------- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/02_guides/code/_scrapy_project/src/__main__.py b/docs/02_guides/code/_scrapy_project/src/__main__.py index aaba43c3..752e9e4e 100644 --- a/docs/02_guides/code/_scrapy_project/src/__main__.py +++ b/docs/02_guides/code/_scrapy_project/src/__main__.py @@ -12,11 +12,10 @@ # ruff: noqa: E402, I001 from __future__ import annotations -import asyncio from twisted.internet import asyncioreactor # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. 
-asyncioreactor.install(asyncio.get_event_loop()) # type: ignore[no-untyped-call] +asyncioreactor.install() # type: ignore[no-untyped-call] import os from apify.scrapy import initialize_logging, run_scrapy_actor diff --git a/docs/02_guides/code/_scrapy_project/src/spiders/title.py b/docs/02_guides/code/_scrapy_project/src/spiders/title.py index b92ccb86..d3bced34 100644 --- a/docs/02_guides/code/_scrapy_project/src/spiders/title.py +++ b/docs/02_guides/code/_scrapy_project/src/spiders/title.py @@ -24,6 +24,9 @@ class TitleSpider(Spider): name = 'title_spider' + # Limit the number of pages to scrape. + custom_settings = {'CLOSESPIDER_PAGECOUNT': 10} + def __init__( self, start_urls: list[str], diff --git a/tests/integration/README.md b/tests/integration/README.md index a3b2dbcf..81dad75e 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -94,7 +94,6 @@ async def test_something( output_record = await actor.last_run().key_value_store().get_record('OUTPUT') assert output_record is not None assert output_record['value'] == expected_output - ``` Or you can pass multiple source files with the `source_files` argument, if you need something really complex: diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index 0df1ff38..cd19947d 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,2 +1,3 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER +scrapy~=2.12.0 diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index 283b1935..52a03188 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -2,21 +2,15 @@ from typing import TYPE_CHECKING -import pytest - if TYPE_CHECKING: from .conftest import MakeActorFunction, RunActorFunction -@pytest.mark.only async def test_actor_scrapy_title_spider( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: actor_source_files = { - 'requirements.txt': """ - scrapy ~= 2.12 - """, 'src/spiders/title.py': """ from __future__ import annotations from typing import TYPE_CHECKING, Any @@ -32,6 +26,9 @@ async def test_actor_scrapy_title_spider( class TitleSpider(Spider): name = 'title_spider' + # Limit the number of pages to scrape. + custom_settings = {'CLOSESPIDER_PAGECOUNT': 10} + def __init__( self, start_urls: list[str], @@ -61,7 +58,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None import scrapy class TitleItem(scrapy.Item): - url = scrapy.Field + url = scrapy.Field() title = scrapy.Field() """, 'src/settings.py': """ @@ -107,11 +104,10 @@ async def main() -> None: """, 'src/__main__.py': """ from __future__ import annotations - import asyncio from twisted.internet import asyncioreactor # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. 
- asyncioreactor.install(asyncio.get_event_loop()) + asyncioreactor.install() import os from apify.scrapy import initialize_logging, run_scrapy_actor @@ -133,5 +129,8 @@ async def main() -> None: items = await actor.last_run().dataset().list_items() - assert items.count == 48 - assert items.items == {'blah'} + assert items.count >= 10 + + for item in items.items: + assert 'url' in item + assert 'title' in item From bd507876a19b68721f471ff7221b64b5bd5eed2e Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 7 Feb 2025 11:36:43 +0100 Subject: [PATCH 04/16] rm scrapy condition for sys exit --- .../code/_scrapy_project/.actor/Dockerfile | 16 +++------------- src/apify/_actor.py | 10 ---------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/docs/02_guides/code/_scrapy_project/.actor/Dockerfile b/docs/02_guides/code/_scrapy_project/.actor/Dockerfile index f486156c..3a268039 100644 --- a/docs/02_guides/code/_scrapy_project/.actor/Dockerfile +++ b/docs/02_guides/code/_scrapy_project/.actor/Dockerfile @@ -1,18 +1,8 @@ -FROM apify/actor-python:3.12 +FROM apify/actor-python:3.13 -COPY pyproject.toml ./ +COPY requirements.txt ./ -RUN echo "Python version:" \ - && python --version \ - && echo "Pip version:" \ - && pip --version \ - && echo "Installing Poetry:" \ - && pip install --no-cache-dir poetry~=1.8.0 \ - && echo "Installing dependencies:" \ - && poetry config virtualenvs.create false \ - && poetry install --only main --no-interaction --no-ansi \ - && rm -rf /tmp/.poetry-cache \ - && echo "All installed Python packages:" \ +RUN pip install --no-cache-dir --requirement requirements.txt \ && pip freeze COPY . ./ diff --git a/src/apify/_actor.py b/src/apify/_actor.py index acb03885..26fb0ce8 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -35,13 +35,6 @@ from apify.log import _configure_logging, logger from apify.storages import Dataset, KeyValueStore, RequestQueue -try: - import scrapy # noqa: F401 - - scrapy_installed = True -except ImportError: - scrapy_installed = False - if TYPE_CHECKING: import logging from types import TracebackType @@ -277,9 +270,6 @@ async def finalize() -> None: self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython') elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test') - elif scrapy_installed: - self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running with Scrapy') - # Otherwise, it will just freeze. 
else: sys.exit(exit_code) From 0c0fbcc66d5f33eb9996546a1ab66f9842aea5a6 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 7 Feb 2025 12:18:00 +0100 Subject: [PATCH 05/16] Polishment --- docs/02_guides/05_scrapy.mdx | 22 +-- .../code/_scrapy_project/.actor/Dockerfile | 12 -- .../code/_scrapy_project/.actor/actor.json | 13 -- .../_scrapy_project/.actor/input_schema.json | 33 ---- .../code/_scrapy_project/.dockerignore | 155 ------------------ .../02_guides/code/_scrapy_project/.gitignore | 154 ----------------- docs/02_guides/code/_scrapy_project/README.md | 32 ---- .../code/_scrapy_project/requirements.txt | 5 - .../02_guides/code/_scrapy_project/scrapy.cfg | 5 - .../code/_scrapy_project/src/__main__.py | 30 ---- .../code/_scrapy_project/src/items.py | 17 -- .../code/_scrapy_project/src/middlewares.py | 129 --------------- .../code/_scrapy_project/src/pipelines.py | 21 --- .../_scrapy_project/src/spiders/__init__.py | 10 -- .../src/__init__.py | 0 .../code/scrapy_project/src/__main__.py | 19 +++ .../code/scrapy_project/src/items.py | 8 + .../src/main.py | 22 +-- .../src/py.typed | 0 .../src/settings.py | 8 - .../src/spiders/__init__.py} | 0 .../code/scrapy_project/src/spiders/py.typed | 0 .../src/spiders/title.py | 0 src/apify/_proxy_configuration.py | 3 - src/apify/scrapy/__init__.py | 29 +++- src/apify/scrapy/_actor_runner.py | 6 - src/apify/scrapy/_logging_config.py | 6 - src/apify/scrapy/middlewares/apify_proxy.py | 18 +- .../scrapy/pipelines/actor_dataset_push.py | 16 +- src/apify/scrapy/requests.py | 42 ++--- src/apify/scrapy/scheduler.py | 20 +-- src/apify/scrapy/utils.py | 30 +--- 32 files changed, 103 insertions(+), 762 deletions(-) delete mode 100644 docs/02_guides/code/_scrapy_project/.actor/Dockerfile delete mode 100644 docs/02_guides/code/_scrapy_project/.actor/actor.json delete mode 100644 docs/02_guides/code/_scrapy_project/.actor/input_schema.json delete mode 100644 docs/02_guides/code/_scrapy_project/.dockerignore delete mode 100644 docs/02_guides/code/_scrapy_project/.gitignore delete mode 100644 docs/02_guides/code/_scrapy_project/README.md delete mode 100644 docs/02_guides/code/_scrapy_project/requirements.txt delete mode 100644 docs/02_guides/code/_scrapy_project/scrapy.cfg delete mode 100644 docs/02_guides/code/_scrapy_project/src/__main__.py delete mode 100644 docs/02_guides/code/_scrapy_project/src/items.py delete mode 100644 docs/02_guides/code/_scrapy_project/src/middlewares.py delete mode 100644 docs/02_guides/code/_scrapy_project/src/pipelines.py delete mode 100644 docs/02_guides/code/_scrapy_project/src/spiders/__init__.py rename docs/02_guides/code/{_scrapy_project => scrapy_project}/src/__init__.py (100%) create mode 100644 docs/02_guides/code/scrapy_project/src/__main__.py create mode 100644 docs/02_guides/code/scrapy_project/src/items.py rename docs/02_guides/code/{_scrapy_project => scrapy_project}/src/main.py (50%) rename docs/02_guides/code/{_scrapy_project => scrapy_project}/src/py.typed (100%) rename docs/02_guides/code/{_scrapy_project => scrapy_project}/src/settings.py (63%) rename docs/02_guides/code/{_scrapy_project/src/spiders/py.typed => scrapy_project/src/spiders/__init__.py} (100%) create mode 100644 docs/02_guides/code/scrapy_project/src/spiders/py.typed rename docs/02_guides/code/{_scrapy_project => scrapy_project}/src/spiders/title.py (100%) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 2b89ac04..88ca4e0b 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ 
-7,11 +7,11 @@ import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import UnderscoreMainExample from '!!raw-loader!./code/_scrapy_project/src/__main__.py'; -import MainExample from '!!raw-loader!./code/_scrapy_project/src/main.py'; -import ItemsExample from '!!raw-loader!./code/_scrapy_project/src/items.py'; -import SettingsExample from '!!raw-loader!./code/_scrapy_project/src/settings.py'; -import TitleSpiderExample from '!!raw-loader!./code/_scrapy_project/src/spiders/title.py'; +import UnderscoreMainExample from '!!raw-loader!./code/scrapy_project/src/__main__.py'; +import MainExample from '!!raw-loader!./code/scrapy_project/src/main.py'; +import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py'; +import TitleSpiderExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py'; +import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'; [Scrapy](https://scrapy.org/) is an open-source web scraping framework written in Python. It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites, handle pagination and navigation. @@ -68,24 +68,24 @@ Here is an example of a Scrapy Actor that scrapes the titles of web pages and en {UnderscoreMainExample} </CodeBlock> </TabItem> - <TabItem value="main.py" label="main.py" default> + <TabItem value="main.py" label="main.py"> <CodeBlock className="language-python"> {MainExample} </CodeBlock> </TabItem> - <TabItem value="items.py" label="items.py" default> + <TabItem value="items.py" label="items.py"> <CodeBlock className="language-python"> {ItemsExample} </CodeBlock> </TabItem> - <TabItem value="settings.py" label="settings.py" default> + <TabItem value="spiders/title.py" label="spiders/title.py"> <CodeBlock className="language-python"> - {SettingsExample} + {TitleSpiderExample} </CodeBlock> </TabItem> - <TabItem value="spiders/title.py" label="spiders/title.py" default> + <TabItem value="settings.py" label="settings.py"> <CodeBlock className="language-python"> - {TitleSpiderExample} + {SettingsExample} </CodeBlock> </TabItem> </Tabs> diff --git a/docs/02_guides/code/_scrapy_project/.actor/Dockerfile b/docs/02_guides/code/_scrapy_project/.actor/Dockerfile deleted file mode 100644 index 3a268039..00000000 --- a/docs/02_guides/code/_scrapy_project/.actor/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM apify/actor-python:3.13 - -COPY requirements.txt ./ - -RUN pip install --no-cache-dir --requirement requirements.txt \ - && pip freeze - -COPY . ./ - -RUN python3 -m compileall -q . 
- -CMD ["python3", "-m", "src"] diff --git a/docs/02_guides/code/_scrapy_project/.actor/actor.json b/docs/02_guides/code/_scrapy_project/.actor/actor.json deleted file mode 100644 index 418b0ffe..00000000 --- a/docs/02_guides/code/_scrapy_project/.actor/actor.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "actorSpecification": 1, - "name": "getting-started-python-scrapy", - "title": "Getting started with Python and Scrapy", - "description": "Scrapes titles of websites using Scrapy.", - "version": "0.0", - "buildTag": "latest", - "meta": { - "templateId": "python-scrapy" - }, - "input": "./input_schema.json", - "dockerfile": "./Dockerfile" -} diff --git a/docs/02_guides/code/_scrapy_project/.actor/input_schema.json b/docs/02_guides/code/_scrapy_project/.actor/input_schema.json deleted file mode 100644 index 6714b865..00000000 --- a/docs/02_guides/code/_scrapy_project/.actor/input_schema.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "title": "Python Scrapy Scraper", - "type": "object", - "schemaVersion": 1, - "properties": { - "startUrls": { - "title": "Start URLs", - "type": "array", - "description": "URLs to start with", - "editor": "requestListSources", - "prefill": [{ "url": "https://crawlee.dev/" }], - "default": [{ "url": "https://crawlee.dev/" }] - }, - "allowedDomains": { - "title": "Allowed domains", - "type": "array", - "description": "Domains that the scraper is allowed to crawl.", - "editor": "json", - "prefill": ["crawlee.dev"], - "default": ["crawlee.dev"] - }, - "proxyConfiguration": { - "sectionCaption": "Proxy and HTTP configuration", - "title": "Proxy configuration", - "type": "object", - "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.", - "editor": "proxy", - "prefill": { "useApifyProxy": false }, - "default": { "useApifyProxy": false } - } - }, - "required": ["startUrls"] -} diff --git a/docs/02_guides/code/_scrapy_project/.dockerignore b/docs/02_guides/code/_scrapy_project/.dockerignore deleted file mode 100644 index 6eb49d35..00000000 --- a/docs/02_guides/code/_scrapy_project/.dockerignore +++ /dev/null @@ -1,155 +0,0 @@ -.git -.mise.toml -.nvim.lua -storage - -# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -.python-version - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ diff --git a/docs/02_guides/code/_scrapy_project/.gitignore b/docs/02_guides/code/_scrapy_project/.gitignore deleted file mode 100644 index f4ce363f..00000000 --- a/docs/02_guides/code/_scrapy_project/.gitignore +++ /dev/null @@ -1,154 +0,0 @@ -.mise.toml -.nvim.lua -storage - -# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -.python-version - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ diff --git a/docs/02_guides/code/_scrapy_project/README.md b/docs/02_guides/code/_scrapy_project/README.md deleted file mode 100644 index fb7b8d7f..00000000 --- a/docs/02_guides/code/_scrapy_project/README.md +++ /dev/null @@ -1,32 +0,0 @@ -## Python Scrapy template - -A template example built with Scrapy to scrape page titles from URLs defined in the input parameter. It shows how to use Apify SDK for Python and Scrapy pipelines to save results. - -## Included features - -- **[Apify SDK](https://docs.apify.com/sdk/python/)** for Python - a toolkit for building Apify [Actors](https://apify.com/actors) and scrapers in Python -- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input -- **[Request queue](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-request-queues)** - queues into which you can put the URLs you want to scrape -- **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes -- **[Scrapy](https://scrapy.org/)** - a fast high-level web scraping framework - -## How it works - -This code is a Python script that uses Scrapy to scrape web pages and extract data from them. 
Here's a brief overview of how it works: - -- The script reads the input data from the Actor instance, which is expected to contain a `start_urls` key with a list of URLs to scrape. -- The script then creates a Scrapy spider that will scrape the URLs. This Spider (class `TitleSpider`) is storing URLs and titles. -- Scrapy pipeline is used to save the results to the default dataset associated with the Actor run using the `push_data` method of the Actor instance. -- The script catches any exceptions that occur during the [web scraping](https://apify.com/web-scraping) process and logs an error message using the `Actor.log.exception` method. - -## Resources - -- [Web scraping with Scrapy](https://blog.apify.com/web-scraping-with-scrapy/) -- [Python tutorials in Academy](https://docs.apify.com/academy/python) -- [Alternatives to Scrapy for web scraping in 2023](https://blog.apify.com/alternatives-scrapy-web-scraping/) -- [Beautiful Soup vs. Scrapy for web scraping](https://blog.apify.com/beautiful-soup-vs-scrapy-web-scraping/) -- [Integration with Zapier](https://apify.com/integrations), Make, Google Drive, and others -- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM) -- A short guide on how to build web scrapers using code templates: - -[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w) diff --git a/docs/02_guides/code/_scrapy_project/requirements.txt b/docs/02_guides/code/_scrapy_project/requirements.txt deleted file mode 100644 index 8d2de65b..00000000 --- a/docs/02_guides/code/_scrapy_project/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -# Feel free to add your Python dependencies below. For formatting guidelines, see: -# https://pip.pypa.io/en/latest/reference/requirements-file-format/ - -apify[scrapy] < 3.0 -scrapy ~= 2.12 diff --git a/docs/02_guides/code/_scrapy_project/scrapy.cfg b/docs/02_guides/code/_scrapy_project/scrapy.cfg deleted file mode 100644 index da962db6..00000000 --- a/docs/02_guides/code/_scrapy_project/scrapy.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[settings] -default = src.settings - -[deploy] -project = src diff --git a/docs/02_guides/code/_scrapy_project/src/__main__.py b/docs/02_guides/code/_scrapy_project/src/__main__.py deleted file mode 100644 index 752e9e4e..00000000 --- a/docs/02_guides/code/_scrapy_project/src/__main__.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Apify Actor integration for Scrapy projects. - -This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's -logging system, and establishing the required environment to run the Scrapy spider within the Apify platform. - -This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally -or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using -`scrapy crawl title_spider`. - -We recommend you do not modify this file unless you really know what you are doing. -""" -# ruff: noqa: E402, I001 - -from __future__ import annotations -from twisted.internet import asyncioreactor - -# Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. -asyncioreactor.install() # type: ignore[no-untyped-call] - -import os -from apify.scrapy import initialize_logging, run_scrapy_actor -from .main import main - -# Ensure the location to the Scrapy settings module is defined. 
-os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' - - -if __name__ == '__main__': - initialize_logging() - run_scrapy_actor(main()) diff --git a/docs/02_guides/code/_scrapy_project/src/items.py b/docs/02_guides/code/_scrapy_project/src/items.py deleted file mode 100644 index eae7ff23..00000000 --- a/docs/02_guides/code/_scrapy_project/src/items.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Scrapy item models module. - -This module defines Scrapy item models for scraped data. Items represent structured data -extracted by spiders. - -For detailed information on creating and utilizing items, refer to the official documentation: -https://docs.scrapy.org/en/latest/topics/items.html -""" - -from scrapy import Field, Item - - -class TitleItem(Item): - """Represents a title item scraped from a web page.""" - - url = Field() - title = Field() diff --git a/docs/02_guides/code/_scrapy_project/src/middlewares.py b/docs/02_guides/code/_scrapy_project/src/middlewares.py deleted file mode 100644 index 8fea4184..00000000 --- a/docs/02_guides/code/_scrapy_project/src/middlewares.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Scrapy middlewares module. - -This module defines Scrapy middlewares. Middlewares are processing components that handle requests and -responses, typically used for adding custom headers, retrying requests, and handling exceptions. - -There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information -on creating and utilizing them, refer to the official documentation: -https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -https://docs.scrapy.org/en/latest/topics/spider-middleware.html -""" -# ruff: noqa: ARG002, UP028 - -from __future__ import annotations - -from typing import TYPE_CHECKING - -# Useful for handling different item types with a single interface -from scrapy import Request, Spider, signals - -if TYPE_CHECKING: - from collections.abc import Generator, Iterable - - from scrapy.crawler import Crawler - from scrapy.http import Response - - -class TitleSpiderMiddleware: - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware: - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response: Response, spider: Spider) -> None: - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output( - self, - response: Response, - result: Iterable, - spider: Spider, - ) -> Generator[Iterable[Request] | None, None, None]: - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, or item objects. - for i in result: - yield i - - def process_spider_exception( - self, - response: Response, - exception: BaseException, - spider: Spider, - ) -> Iterable[Request] | None: - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request or item objects. 
- pass - - def process_start_requests( - self, start_requests: Iterable[Request], spider: Spider - ) -> Iterable[Request]: # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn't have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider: Spider) -> None: - pass - - -class TitleDownloaderMiddleware: - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware: - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request: Request, spider: Spider) -> Request | Response | None: - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response: - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None: - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. - - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider: Spider) -> None: - pass diff --git a/docs/02_guides/code/_scrapy_project/src/pipelines.py b/docs/02_guides/code/_scrapy_project/src/pipelines.py deleted file mode 100644 index 7a1c9e8b..00000000 --- a/docs/02_guides/code/_scrapy_project/src/pipelines.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Scrapy item pipelines module. - -This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components -that handle the scraped items, typically used for cleaning, validating, and persisting data. - -For detailed information on creating and utilizing item pipelines, refer to the official documentation: -http://doc.scrapy.org/en/latest/topics/item-pipeline.html -""" -# ruff: noqa: ARG002 - -from scrapy import Spider - -from .items import TitleItem - - -class TitleItemPipeline: - """This item pipeline defines processing steps for TitleItem objects scraped by spiders.""" - - def process_item(self, item: TitleItem, spider: Spider) -> TitleItem: - # Do something with the item here, such as cleaning it or persisting it to a database - return item diff --git a/docs/02_guides/code/_scrapy_project/src/spiders/__init__.py b/docs/02_guides/code/_scrapy_project/src/spiders/__init__.py deleted file mode 100644 index 3a286fc6..00000000 --- a/docs/02_guides/code/_scrapy_project/src/spiders/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Scrapy spiders package. - -This package contains the spiders for your Scrapy project. 
Spiders are the classes that define how to scrape -and process data from websites. - -For detailed information on creating and utilizing spiders, refer to the official documentation: -https://docs.scrapy.org/en/latest/topics/spiders.html -""" - -from .title import TitleSpider diff --git a/docs/02_guides/code/_scrapy_project/src/__init__.py b/docs/02_guides/code/scrapy_project/src/__init__.py similarity index 100% rename from docs/02_guides/code/_scrapy_project/src/__init__.py rename to docs/02_guides/code/scrapy_project/src/__init__.py diff --git a/docs/02_guides/code/scrapy_project/src/__main__.py b/docs/02_guides/code/scrapy_project/src/__main__.py new file mode 100644 index 00000000..7bda6f73 --- /dev/null +++ b/docs/02_guides/code/scrapy_project/src/__main__.py @@ -0,0 +1,19 @@ +# ruff: noqa: E402, I001 + +from __future__ import annotations +from twisted.internet import asyncioreactor + +# Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. +asyncioreactor.install() # type: ignore[no-untyped-call] + +import os +from apify.scrapy import initialize_logging, run_scrapy_actor +from .main import main + +# Ensure the location to the Scrapy settings module is defined. +os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' + + +if __name__ == '__main__': + initialize_logging() + run_scrapy_actor(main()) diff --git a/docs/02_guides/code/scrapy_project/src/items.py b/docs/02_guides/code/scrapy_project/src/items.py new file mode 100644 index 00000000..ba926f2d --- /dev/null +++ b/docs/02_guides/code/scrapy_project/src/items.py @@ -0,0 +1,8 @@ +from scrapy import Field, Item + + +class TitleItem(Item): + """Represents a title item scraped from a web page.""" + + url = Field() + title = Field() diff --git a/docs/02_guides/code/_scrapy_project/src/main.py b/docs/02_guides/code/scrapy_project/src/main.py similarity index 50% rename from docs/02_guides/code/_scrapy_project/src/main.py rename to docs/02_guides/code/scrapy_project/src/main.py index c593435e..eb967296 100644 --- a/docs/02_guides/code/_scrapy_project/src/main.py +++ b/docs/02_guides/code/scrapy_project/src/main.py @@ -1,23 +1,3 @@ -"""Main entry point for the Apify Actor & Scrapy integration. - -This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine -processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by -applying Apify-related settings. Which includes adding a custom scheduler, retry middleware, and an item pipeline -for pushing data to the Apify dataset. - -Customization: --------------- - -Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy -components like spiders and handling Actor input. However, make sure you have a clear understanding of your -modifications. For instance, removing `apply_apify_settings` break the integration between Scrapy and Apify. - -Documentation: --------------- - -For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and -other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy. -""" # ruff: noqa: I001 from __future__ import annotations @@ -26,7 +6,7 @@ from scrapy.utils.defer import deferred_to_future from apify import Actor -from apify.scrapy.utils import apply_apify_settings +from apify.scrapy import apply_apify_settings # Import your Scrapy spider here. 
from .spiders.title import TitleSpider as Spider diff --git a/docs/02_guides/code/_scrapy_project/src/py.typed b/docs/02_guides/code/scrapy_project/src/py.typed similarity index 100% rename from docs/02_guides/code/_scrapy_project/src/py.typed rename to docs/02_guides/code/scrapy_project/src/py.typed diff --git a/docs/02_guides/code/_scrapy_project/src/settings.py b/docs/02_guides/code/scrapy_project/src/settings.py similarity index 63% rename from docs/02_guides/code/_scrapy_project/src/settings.py rename to docs/02_guides/code/scrapy_project/src/settings.py index f3f0b696..4c5aaacb 100644 --- a/docs/02_guides/code/_scrapy_project/src/settings.py +++ b/docs/02_guides/code/scrapy_project/src/settings.py @@ -1,11 +1,3 @@ -"""Scrapy settings module. - -This module contains Scrapy settings for the project, defining various configurations and options. - -For more comprehensive details on Scrapy settings, refer to the official documentation: -http://doc.scrapy.org/en/latest/topics/settings.html -""" - BOT_NAME = 'titlebot' DEPTH_LIMIT = 1 LOG_LEVEL = 'INFO' diff --git a/docs/02_guides/code/_scrapy_project/src/spiders/py.typed b/docs/02_guides/code/scrapy_project/src/spiders/__init__.py similarity index 100% rename from docs/02_guides/code/_scrapy_project/src/spiders/py.typed rename to docs/02_guides/code/scrapy_project/src/spiders/__init__.py diff --git a/docs/02_guides/code/scrapy_project/src/spiders/py.typed b/docs/02_guides/code/scrapy_project/src/spiders/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/docs/02_guides/code/_scrapy_project/src/spiders/title.py b/docs/02_guides/code/scrapy_project/src/spiders/title.py similarity index 100% rename from docs/02_guides/code/_scrapy_project/src/spiders/title.py rename to docs/02_guides/code/scrapy_project/src/spiders/title.py diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index 6fa64f56..88c2a016 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -10,7 +10,6 @@ import httpx from apify_shared.consts import ApifyEnvVars -from apify_shared.utils import ignore_docs from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo from crawlee.proxy_configuration import _NewUrlFunction @@ -28,7 +27,6 @@ SESSION_ID_MAX_LENGTH = 50 -@ignore_docs def is_url(url: str) -> bool: """Check if the given string is a valid URL.""" try: @@ -104,7 +102,6 @@ class ProxyConfiguration(CrawleeProxyConfiguration): _configuration: Configuration - @ignore_docs def __init__( self, *, diff --git a/src/apify/scrapy/__init__.py b/src/apify/scrapy/__init__.py index 55b88076..dfab9943 100644 --- a/src/apify/scrapy/__init__.py +++ b/src/apify/scrapy/__init__.py @@ -1,13 +1,30 @@ -from ._actor_runner import run_scrapy_actor -from ._logging_config import initialize_logging -from .requests import to_apify_request, to_scrapy_request -from .scheduler import ApifyScheduler -from .utils import get_basic_auth_header, get_running_event_loop_id +from crawlee._utils.try_import import install_import_hook as _install_import_hook +from crawlee._utils.try_import import try_import as _try_import + +_install_import_hook(__name__) + +# The following imports use try_import to handle optional dependencies, as they may not always be available. 
+ +with _try_import(__name__, 'run_scrapy_actor'): + from ._actor_runner import run_scrapy_actor + +with _try_import(__name__, 'initialize_logging'): + from ._logging_config import initialize_logging + +with _try_import(__name__, 'to_apify_request', 'to_scrapy_request'): + from .requests import to_apify_request, to_scrapy_request + +with _try_import(__name__, 'ApifyScheduler'): + from .scheduler import ApifyScheduler + +with _try_import(__name__, 'apply_apify_settings', 'get_basic_auth_header'): + from .utils import apply_apify_settings, get_basic_auth_header + __all__ = [ 'ApifyScheduler', + 'apply_apify_settings', 'get_basic_auth_header', - 'get_running_event_loop_id', 'initialize_logging', 'run_scrapy_actor', 'to_apify_request', diff --git a/src/apify/scrapy/_actor_runner.py b/src/apify/scrapy/_actor_runner.py index 696b7a7f..390b2fc3 100644 --- a/src/apify/scrapy/_actor_runner.py +++ b/src/apify/scrapy/_actor_runner.py @@ -1,9 +1,3 @@ -"""Runner for Apify Actors using Twisted's reactor. - -This module provides functions to run your Actor with Scrapy project inside within the Twisted -reactor by bridging asyncio coroutines with Twisted Deferreds. -""" - from __future__ import annotations import asyncio diff --git a/src/apify/scrapy/_logging_config.py b/src/apify/scrapy/_logging_config.py index 13ea8299..45cd2908 100644 --- a/src/apify/scrapy/_logging_config.py +++ b/src/apify/scrapy/_logging_config.py @@ -1,9 +1,3 @@ -"""Logging configuration for Apify Actor & Scrapy integration. - -This module configures a custom logging system for Apify Actors and monkey-patches Scrapy's logging -to use a Apify log formatter and settings. -""" - from __future__ import annotations import logging diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index 7d7eaaec..4721a248 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -3,19 +3,15 @@ from typing import TYPE_CHECKING from urllib.parse import ParseResult, urlparse -try: - if TYPE_CHECKING: - from scrapy import Request, Spider - from scrapy.crawler import Crawler - from scrapy.core.downloader.handlers.http11 import TunnelError - from scrapy.exceptions import NotConfigured -except ImportError as exc: - raise ImportError( - 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', - ) from exc +from scrapy.core.downloader.handlers.http11 import TunnelError +from scrapy.exceptions import NotConfigured from apify import Actor, ProxyConfiguration -from apify.scrapy.utils import get_basic_auth_header +from apify.scrapy import get_basic_auth_header + +if TYPE_CHECKING: + from scrapy import Request, Spider + from scrapy.crawler import Crawler class ApifyHttpProxyMiddleware: diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py index d2d983cc..995af774 100644 --- a/src/apify/scrapy/pipelines/actor_dataset_push.py +++ b/src/apify/scrapy/pipelines/actor_dataset_push.py @@ -1,19 +1,17 @@ from __future__ import annotations +from logging import getLogger from typing import TYPE_CHECKING from itemadapter.adapter import ItemAdapter -try: - if TYPE_CHECKING: - from scrapy import Item, Spider -except ImportError as exc: - raise ImportError( - 'To use this module, you need to install the "scrapy" extra. 
Run "pip install apify[scrapy]".', - ) from exc - from apify import Actor +if TYPE_CHECKING: + from scrapy import Item, Spider + +logger = getLogger(__name__) + class ActorDatasetPushPipeline: """A Scrapy pipeline for pushing items to an Actor's default dataset. @@ -28,6 +26,6 @@ async def process_item( ) -> Item: """Pushes the provided Scrapy item to the Actor's default dataset.""" item_dict = ItemAdapter(item).asdict() - Actor.log.debug(f'Pushing item={item_dict} produced by spider={spider} to the dataset.') + logger.debug(f'Pushing item={item_dict} produced by spider={spider} to the dataset.') await Actor.push_data(item_dict) return item diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 4ded045f..b4d3fd83 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -2,25 +2,19 @@ import codecs import pickle +from logging import getLogger from typing import Any, cast -from apify_shared.utils import ignore_docs - -try: - from scrapy import Request, Spider - from scrapy.http.headers import Headers - from scrapy.utils.request import request_from_dict -except ImportError as exc: - raise ImportError( - 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', - ) from exc +from scrapy import Request, Spider +from scrapy.http.headers import Headers +from scrapy.utils.request import request_from_dict from crawlee import Request as CrawleeRequest from crawlee._types import HttpHeaders from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id -from apify import Actor +logger = getLogger(__name__) def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: @@ -31,7 +25,6 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times')) -@ignore_docs def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None: """Convert a Scrapy request to an Apify request. @@ -43,13 +36,10 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest The converted Apify request if the conversion was successful, otherwise None. """ if not isinstance(scrapy_request, Request): - Actor.log.warning( # type: ignore[unreachable] - 'Failed to convert to Apify request: Scrapy request must be a Request instance.' 
- ) + logger.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.') # type: ignore[unreachable] return None - call_id = crypto_random_object_id(8) - Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...') + logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...') try: if _is_request_produced_by_middleware(scrapy_request): @@ -84,7 +74,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest if isinstance(scrapy_request.headers, Headers): apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict())) else: - Actor.log.warning( # type: ignore[unreachable] + logger.warning( # type: ignore[unreachable] f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}' ) @@ -97,14 +87,13 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded except Exception as exc: - Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}') + logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}') return None - Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}') + logger.debug(f'scrapy_request was converted to the apify_request={apify_request}') return apify_request -@ignore_docs def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: """Convert an Apify request to a Scrapy request. @@ -122,15 +111,14 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: if not isinstance(cast(Any, apify_request), CrawleeRequest): raise TypeError('apify_request must be a crawlee.Request instance') - call_id = crypto_random_object_id(8) - Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...') + logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...') # If the apify_request comes from the Scrapy if 'scrapy_request' in apify_request.user_data: # Deserialize the Scrapy Request from the apify_request. # - This process involves decoding the base64-encoded request data and reconstructing # the Scrapy Request object from its dictionary representation. 
- Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...') + logger.debug('Restoring the Scrapy Request from the apify_request...') scrapy_request_dict_encoded = apify_request.user_data['scrapy_request'] if not isinstance(scrapy_request_dict_encoded, str): @@ -144,7 +132,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: if not isinstance(scrapy_request, Request): raise TypeError('scrapy_request must be an instance of the Request class') - Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...') + logger.debug(f'Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...') # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} @@ -154,7 +142,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: # If the apify_request comes directly from the Request Queue, typically start URLs else: - Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)') + logger.debug('Gonna create a new Scrapy Request (cannot be restored)') scrapy_request = Request( url=apify_request.url, @@ -173,5 +161,5 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: if apify_request.user_data: scrapy_request.meta['userData'] = apify_request.user_data - Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}') + logger.debug(f'an apify_request was converted to the scrapy_request={scrapy_request}') return scrapy_request diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 92676944..70abeb55 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -11,9 +11,7 @@ from scrapy.core.scheduler import BaseScheduler from scrapy.utils.reactor import is_asyncio_reactor_installed -from crawlee._utils.crypto import crypto_random_object_id - -from apify import Actor, Configuration +from apify import Configuration from apify.apify_storage_client import ApifyStorageClient from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.storages import RequestQueue @@ -204,18 +202,17 @@ def enqueue_request(self, request: Request) -> bool: Returns: True if the request was successfully enqueued, False otherwise. 
""" - call_id = crypto_random_object_id(8) - Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...') + logger.debug(f'ApifyScheduler.enqueue_request was called (scrapy_request={request})...') if not isinstance(self.spider, Spider): raise TypeError('self.spider must be an instance of the Spider class') apify_request = to_apify_request(request, spider=self.spider) if apify_request is None: - Actor.log.error(f'Request {request} could not be converted to Apify request.') + logger.error(f'Request {request} could not be converted to Apify request.') return False - Actor.log.debug(f'[{call_id}]: Converted to apify_request: {apify_request}') + logger.debug(f'Converted to apify_request: {apify_request}') if not isinstance(self._rq, RequestQueue): raise TypeError('self._rq must be an instance of the RequestQueue class') @@ -225,7 +222,7 @@ def enqueue_request(self, request: Request) -> bool: traceback.print_exc() raise - Actor.log.debug(f'[{call_id}]: rq.add_request result: {result}') + logger.debug(f'rq.add_request result: {result}') return bool(result.was_already_present) def next_request(self) -> Request | None: @@ -234,8 +231,7 @@ def next_request(self) -> Request | None: Returns: The next request, or None if there are no more requests. """ - call_id = crypto_random_object_id(8) - Actor.log.debug(f'[{call_id}]: next_request called...') + logger.debug('next_request called...') if not isinstance(self._rq, RequestQueue): raise TypeError('self._rq must be an instance of the RequestQueue class') @@ -245,7 +241,7 @@ def next_request(self) -> Request | None: traceback.print_exc() raise - Actor.log.debug(f'[{call_id}]: Fetched apify_request: {apify_request}') + logger.debug(f'Fetched apify_request: {apify_request}') if apify_request is None: return None @@ -261,5 +257,5 @@ def next_request(self) -> Request | None: raise scrapy_request = to_scrapy_request(apify_request, spider=self.spider) - Actor.log.debug(f'[{call_id}]: Converted to scrapy_request: {scrapy_request}') + logger.debug(f'Converted to scrapy_request: {scrapy_request}') return scrapy_request diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index d248a639..894cda95 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -1,26 +1,16 @@ from __future__ import annotations -import asyncio from base64 import b64encode from typing import TYPE_CHECKING from urllib.parse import unquote -from apify_shared.utils import ignore_docs +from scrapy.utils.project import get_project_settings +from scrapy.utils.python import to_bytes -try: - from scrapy.utils.project import get_project_settings - from scrapy.utils.python import to_bytes +if TYPE_CHECKING: + from scrapy.settings import Settings - if TYPE_CHECKING: - from scrapy.settings import Settings -except ImportError as exc: - raise ImportError( - 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run ' - '"pip install apify[scrapy]".' - ) from exc - -@ignore_docs def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes: """Generate a basic authentication header for the given username and password.""" string = f'{unquote(username)}:{unquote(password)}' @@ -28,18 +18,6 @@ def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'la return b'Basic ' + b64encode(user_pass) -@ignore_docs -def get_running_event_loop_id() -> int: - """Get the ID of the currently running event loop. 
- - It could be useful mainly for debugging purposes. - - Returns: - The ID of the event loop. - """ - return id(asyncio.get_running_loop()) - - def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings: """Integrates Apify configuration into a Scrapy project settings. From 357cc27af9fce1188b17a2a58efbdd8c487256a6 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 7 Feb 2025 12:24:23 +0100 Subject: [PATCH 06/16] revert non-intentionally changes in proxy conf --- src/apify/_proxy_configuration.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index 88c2a016..6fa64f56 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -10,6 +10,7 @@ import httpx from apify_shared.consts import ApifyEnvVars +from apify_shared.utils import ignore_docs from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo from crawlee.proxy_configuration import _NewUrlFunction @@ -27,6 +28,7 @@ SESSION_ID_MAX_LENGTH = 50 +@ignore_docs def is_url(url: str) -> bool: """Check if the given string is a valid URL.""" try: @@ -102,6 +104,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration): _configuration: Configuration + @ignore_docs def __init__( self, *, From e7f0aebaa5520da513f21d8d092227eddf06bbd1 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 7 Feb 2025 15:02:37 +0100 Subject: [PATCH 07/16] Update the Scrapy guide --- docs/02_guides/05_scrapy.mdx | 66 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 88ca4e0b..97463e2e 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -13,54 +13,56 @@ import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py'; import TitleSpiderExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py'; import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'; -[Scrapy](https://scrapy.org/) is an open-source web scraping framework written in Python. It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites, handle pagination and navigation. +[Scrapy](https://scrapy.org/) is an open-source web scraping framework for Python. It provides tools for defining scrapers, extracting data from web pages, following links, and handling pagination. With the Apify SDK, Scrapy projects can be converted into Apify [Actors](https://docs.apify.com/platform/actors), integrated with Apify [storages](https://docs.apify.com/platform/storage), and executed on the Apify [platform](https://docs.apify.com/platform). -:::tip +## Integrating Scrapy with the Apify platform -Our CLI now supports transforming Scrapy projects into Apify Actors with a single command! Check out the [Scrapy migration guide](https://docs.apify.com/cli/docs/integrating-scrapy) for more information. +The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). 
The key step is to install Twisted's `asyncioreactor` so that Twisted runs its asyncio-compatible event loop. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications.
-:::
+<CodeBlock className="language-python" title="__main__.py: The Actor entry point">
+    {UnderscoreMainExample}
+</CodeBlock>
-Some of the key features of Scrapy for web scraping include:
+In this setup, `apify.scrapy.initialize_logging` configures an Apify log formatter and reconfigures loggers to ensure consistent logging across Scrapy, the Apify SDK, and other libraries. The `apify.scrapy.run_scrapy_actor` function bridges asyncio coroutines with Twisted's reactor, enabling the Actor's main coroutine, which contains the Scrapy spider, to be executed.
-- **Request and response handling** - Scrapy provides an easy-to-use interface for making HTTP requests and handling responses,
-allowing you to navigate through web pages and extract data.
-- **Robust Spider framework** - Scrapy has a spider framework that allows you to define how to scrape data from websites,
-including how to follow links, how to handle pagination, and how to parse the data.
-- **Built-in data extraction** - Scrapy includes built-in support for data extraction using XPath and CSS selectors,
-allowing you to easily extract data from HTML and XML documents.
-- **Integration with other tool** - Scrapy can be integrated with other Python tools like BeautifulSoup and Selenium for more advanced scraping tasks.
+<CodeBlock className="language-python" title="main.py: The Actor main coroutine">
+    {MainExample}
+</CodeBlock>
-## Using Scrapy template
+Within the Actor's main coroutine, the Actor's input is processed as usual. The function `apify.scrapy.apply_apify_settings` is then used to configure Scrapy settings with Apify-specific components before the spider is executed. The key components and other helper functions are described in the next section.
-The fastest way to start using Scrapy in Apify Actors is by leveraging the [Scrapy Actor template](https://apify.com/templates/categories/python). This template provides a pre-configured structure and setup necessary to integrate Scrapy into your Actors seamlessly. It includes: setting up the Scrapy settings, `asyncio` reactor, Actor logger, and item pipeline as necessary to make Scrapy spiders run in Actors and save their outputs in Apify datasets.
+## Key integration components
-## Manual setup
+The Apify SDK provides several custom components to support integration with the Apify platform:
-If you prefer not to use the template, you will need to manually configure several components to integrate Scrapy with the Apify SDK.
+- [`apify.scrapy.ApifyScheduler`](https://docs.apify.com/sdk/python/reference/class/ApifyScheduler) - Replaces Scrapy's default [scheduler](https://docs.scrapy.org/en/latest/topics/scheduler.html) with one that uses Apify's [request queue](https://docs.apify.com/platform/storage/request-queue) for storing requests. It manages enqueuing, dequeuing, and maintaining the state and priority of requests.
+- [`apify.scrapy.ActorDatasetPushPipeline`](https://docs.apify.com/sdk/python/reference/class/ActorDatasetPushPipeline) - A Scrapy [item pipeline](https://docs.scrapy.org/en/latest/topics/item-pipeline.html) that pushes scraped items to Apify's [dataset](https://docs.apify.com/platform/storage/dataset). When enabled, every item produced by the spider is sent to the dataset.
+- [`apify.scrapy.ApifyHttpProxyMiddleware`](https://docs.apify.com/sdk/python/reference/class/ApifyHttpProxyMiddleware) - A Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) that manages proxy configurations. This middleware replaces Scrapy's default `HttpProxyMiddleware` to facilitate the use of Apify's proxy service. -### Event loop & reactor +Additional helper functions in the [`apify.scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) subpackage include: -The Apify SDK is built on Python's asynchronous [`asyncio`](https://docs.python.org/3/library/asyncio.html) library, whereas Scrapy uses [`twisted`](https://twisted.org/) for its asynchronous operations. To make these two frameworks work together, you need to: +- `apply_apify_settings` - Applies Apify-specific components to Scrapy settings. +- `to_apify_request` and `to_scrapy_request` - Convert between Apify and Scrapy request objects. +- `initialize_logging` - Configures logging for the Actor environment. +- `run_scrapy_actor` - Bridges asyncio and Twisted event loops. -- Set the [`AsyncioSelectorReactor`](https://docs.scrapy.org/en/latest/topics/asyncio.html#installing-the-asyncio-reactor) in Scrapy's project settings: This reactor is `twisted`'s implementation of the `asyncio` event loop, enabling compatibility between the two libraries. -- Install [`nest_asyncio`](https://pypi.org/project/nest-asyncio/): The `nest_asyncio` package allows the asyncio event loop to run within an already running loop, which is essential for integration with the Apify SDK. +## Create a new Apify-Scrapy project -By making these adjustments, you can ensure collaboration between `twisted`-based Scrapy and the `asyncio`-based Apify SDK. +The simplest way to start using Scrapy in Apify Actors is to use the [Scrapy Actor template](https://apify.com/templates/categories/python). The template provides a pre-configured project structure and setup that includes all necessary components to run Scrapy spiders as Actors and store their output in Apify datasets. If you prefer manual setup, refer to the example Actor section below for configuration details. -### Other components +## Wrapping an existing Scrapy project -We also prepared other Scrapy components to work with Apify SDK, they are available in the [`apify/scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) sub-package. These components include: +The Apify CLI supports converting an existing Scrapy project into an Apify Actor with a single command. The CLI expects the project to follow the standard Scrapy layout (including a `scrapy.cfg` file in the project root). During the wrapping process, the CLI: -- `ApifyScheduler`: A Scrapy scheduler that uses the Apify Request Queue to manage requests. -- `ApifyHttpProxyMiddleware`: A Scrapy middleware for working with Apify proxies. -- `ActorDatasetPushPipeline`: A Scrapy item pipeline that pushes scraped items into the Apify dataset. +- Creates the necessary files and directories for an Apify Actor. +- Installs the Apify SDK and required dependencies. +- Updates Scrapy settings to include Apify-specific components. -The module contains other helper functions, like `apply_apify_settings` for applying these components to Scrapy settings, and `to_apify_request` and `to_scrapy_request` for converting between Apify and Scrapy request objects. +For further details, see the [Scrapy migration guide](https://docs.apify.com/cli/docs/integrating-scrapy). 
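The settings-update step mentioned above is what the `apply_apify_settings` helper does. As a minimal sketch of how it is typically called (the `useApifyProxy` value and the printed keys are illustrative; the exact set of overridden settings is defined by the SDK itself):

```python
from apify.scrapy import apply_apify_settings

# Start from the project's Scrapy settings (SCRAPY_SETTINGS_MODULE must be set)
# and layer the Apify-specific components on top of them.
settings = apply_apify_settings(proxy_config={'useApifyProxy': True})

# The returned Settings object now routes scraped items to the Apify dataset
# and outgoing requests through the Apify proxy middleware.
print(settings['ITEM_PIPELINES'])
print(settings['DOWNLOADER_MIDDLEWARES'])
```

The resulting `settings` object is then passed to Scrapy's `CrawlerRunner`, as shown in the example Actor below.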
 ## Example Actor
-Here is an example of a Scrapy Actor that scrapes the titles of web pages and enqueues all links found on each page. This example is identical to the one provided in the Apify Actor templates.
+The following example demonstrates a Scrapy Actor that scrapes page titles and enqueues links found on each page. This example aligns with the structure provided in the Apify Actor templates.
 <Tabs>
     <TabItem value="__main__.py" label="__main.py__">
@@ -93,3 +95,11 @@ Here is an example of a Scrapy Actor that scrapes the titles of web pages and en
 ## Conclusion
 
 In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
+
+## Additional resources
+
+- [Apify CLI: Integrating Scrapy projects](https://docs.apify.com/cli/docs/integrating-scrapy)
+- [Apify: Run Scrapy spiders on Apify](https://apify.com/run-scrapy-in-cloud)
+- [Apify templates: Python Actor Scrapy template](https://apify.com/templates/python-scrapy)
+- [Apify store: Scrapy Books Example Actor](https://apify.com/vdusek/scrapy-books-example)
+- [Scrapy: Official documentation](https://docs.scrapy.org/)

From 1321f82d8ee111aa2e0cd67fc1bfe7ac20d7dc0f Mon Sep 17 00:00:00 2001
From: Vlada Dusek <v.dusek96@gmail.com>
Date: Mon, 10 Feb 2025 08:56:43 +0100
Subject: [PATCH 08/16] add async thread helper class

---
 src/apify/scrapy/_async_thread.py | 124 ++++++++++++++++++++++++++++++
 src/apify/scrapy/scheduler.py     | 116 ++++------------------------
 2 files changed, 137 insertions(+), 103 deletions(-)
 create mode 100644 src/apify/scrapy/_async_thread.py

diff --git a/src/apify/scrapy/_async_thread.py b/src/apify/scrapy/_async_thread.py
new file mode 100644
index 00000000..df98b0ff
--- /dev/null
+++ b/src/apify/scrapy/_async_thread.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+import asyncio
+import threading
+from concurrent import futures
+from datetime import timedelta
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine
+
+logger = getLogger(__name__)
+
+
+class AsyncThread:
+    """Class for running an asyncio event loop in a separate thread.
+
+    This allows running asynchronous coroutines from synchronous code by executing them on an event loop
+    that runs in its own dedicated thread.
+    """
+
+    def __init__(self) -> None:
+        self._eventloop = asyncio.new_event_loop()
+
+        # Start the event loop in a dedicated daemon thread.
+        self._thread = threading.Thread(
+            target=self._start_event_loop,
+            daemon=True,
+        )
+        self._thread.start()
+
+    def run_coro(
+        self,
+        coro: Coroutine,
+        timeout: timedelta = timedelta(seconds=60),
+    ) -> Any:
+        """Run a coroutine on an event loop running in a separate thread.
+
+        This method schedules the coroutine to run on the event loop and blocks until the coroutine completes
+        or the specified timeout is reached.
+
+        Args:
+            coro: The coroutine to run.
+            timeout: The maximum number of seconds to wait for the coroutine to finish.
+
+        Returns:
+            The result returned by the coroutine.
+
+        Raises:
+            TimeoutError: If the coroutine does not complete within the timeout.
+ Exception: Any exception raised during coroutine execution. + """ + if not self._eventloop.is_running(): + logger.warning('Event loop is not running! Ignoring coroutine execution.') + return None + + # Submit the coroutine to the event loop running in the other thread. + future = asyncio.run_coroutine_threadsafe(coro, self._eventloop) + try: + # Wait for the coroutine's result until the specified timeout. + return future.result(timeout=timeout.total_seconds()) + except futures.TimeoutError as exc: + logger.exception('Coroutine execution timed out.', exc_info=exc) + raise + except Exception as exc: + logger.exception('Coroutine execution raised an exception.', exc_info=exc) + raise + + def close(self, timeout: timedelta = timedelta(seconds=60)) -> None: + """Close the event loop and its thread gracefully. + + This method cancels all pending tasks, stops the event loop, and waits for the thread to exit. + If the thread does not exit within the given timeout, a forced shutdown is attempted. + + Args: + timeout: The maximum number of seconds to wait for the event loop thread to exit. + """ + if self._eventloop.is_running(): + # Cancel all pending tasks in the event loop. + self.run_coro(self._shutdown_tasks()) + + # Schedule the event loop to stop. + self._eventloop.call_soon_threadsafe(self._eventloop.stop) + + # Wait for the event loop thread to finish execution. + self._thread.join(timeout=timeout.total_seconds()) + + # If the thread is still running after the timeout, force a shutdown. + if self._thread.is_alive(): + logger.warning('Event loop thread did not exit cleanly! Forcing shutdown...') + self._force_exit_event_loop() + + def _start_event_loop(self) -> None: + """Set up and run the asyncio event loop in the dedicated thread.""" + asyncio.set_event_loop(self._eventloop) + try: + self._eventloop.run_forever() + finally: + self._eventloop.close() + logger.debug('Asyncio event loop has been closed.') + + async def _shutdown_tasks(self) -> None: + """Cancel all pending tasks in the event loop.""" + # Retrieve all tasks for the event loop, excluding the current task. + tasks = [task for task in asyncio.all_tasks(self._eventloop) if task is not asyncio.current_task()] + if not tasks: + return + + # Cancel each pending task. + for task in tasks: + task.cancel() + + # Wait until all tasks have been cancelled or finished. 
+ await asyncio.gather(*tasks, return_exceptions=True) + + def _force_exit_event_loop(self) -> None: + """Forcefully shut down the event loop and its thread.""" + try: + logger.info('Forced shutdown of the event loop and its thread...') + self._eventloop.call_soon_threadsafe(self._eventloop.stop) + self._thread.join(timeout=5) + except Exception as exc: + logger.exception('Exception occurred during forced event loop shutdown.', exc_info=exc) diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 70abeb55..d658efeb 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -1,100 +1,25 @@ from __future__ import annotations -import asyncio -import threading import traceback -from concurrent import futures from logging import getLogger -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from scrapy import Spider from scrapy.core.scheduler import BaseScheduler from scrapy.utils.reactor import is_asyncio_reactor_installed +from ._async_thread import AsyncThread +from .requests import to_apify_request, to_scrapy_request from apify import Configuration from apify.apify_storage_client import ApifyStorageClient -from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.storages import RequestQueue if TYPE_CHECKING: - from collections.abc import Coroutine - from scrapy.http.request import Request from twisted.internet.defer import Deferred logger = getLogger(__name__) -_TIMEOUT = 60 -"""The timeout for waiting on asyncio coroutines to finish.""" - - -def _start_event_loop(eventloop: asyncio.AbstractEventLoop) -> None: - """Set and run the event loop until it is stopped. - - Args: - eventloop: The asyncio event loop to run. - """ - asyncio.set_event_loop(eventloop) - try: - eventloop.run_forever() - finally: - eventloop.close() - logger.debug('Asyncio event loop has been closed.') - - -def _run_async_coro(eventloop: asyncio.AbstractEventLoop, coro: Coroutine) -> Any: - """Run a coroutine on the given loop in our separate thread, waiting for its result. - - Args: - eventloop: The asyncio event loop to run the coroutine on. - coro: The coroutine to run. - - Returns: - The result of the coroutine. - """ - if not eventloop.is_running(): - logger.warning('Event loop is not running! Ignoring coroutine execution.') - return None - - future = asyncio.run_coroutine_threadsafe(coro, eventloop) - try: - return future.result(timeout=_TIMEOUT) - except futures.TimeoutError as exc: - logger.exception('Coroutine execution timed out.', exc_info=exc) - raise - except Exception as exc: - logger.exception('Coroutine execution raised an exception.', exc_info=exc) - raise - - -async def _shutdown_async_tasks(eventloop: asyncio.AbstractEventLoop) -> None: - """Cancel and wait for all pending tasks on the current event loop. - - Args: - eventloop: The asyncio event loop to cancel tasks on. - """ - tasks = [task for task in asyncio.all_tasks(eventloop) if task is not asyncio.current_task()] - if not tasks: - return - for task in tasks: - task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - - -def _force_exit_event_loop(eventloop: asyncio.AbstractEventLoop, thread: threading.Thread) -> None: - """Forcefully shut down the event loop and its thread. - - Args: - eventloop: The asyncio event loop to stop. - thread: The thread running the event loop. 
- """ - try: - logger.info('Forced shutdown of the event loop and its thread...') - eventloop.call_soon_threadsafe(eventloop.stop) - thread.join(timeout=5) - except Exception as exc: - logger.exception('Exception occurred during forced event loop shutdown.', exc_info=exc) - class ApifyScheduler(BaseScheduler): """A Scrapy scheduler that uses the Apify request queue to manage requests. @@ -112,10 +37,8 @@ def __init__(self) -> None: self._rq: RequestQueue | None = None self.spider: Spider | None = None - # Create a new event loop and run it in a separate thread. - self._eventloop = asyncio.new_event_loop() - self._thread = threading.Thread(target=lambda: _start_event_loop(self._eventloop), daemon=True) - self._thread.start() + # A thread with the asyncio event loop to run coroutines on. + self._async_thread = AsyncThread() def open(self, spider: Spider) -> Deferred[None] | None: """Open the scheduler. @@ -133,7 +56,7 @@ async def open_rq() -> RequestQueue: return await RequestQueue.open() try: - self._rq = _run_async_coro(self._eventloop, open_rq()) + self._rq = self._async_thread.run_coro(open_rq()) except Exception: traceback.print_exc() raise @@ -150,20 +73,7 @@ def close(self, reason: str) -> None: """ logger.debug(f'Closing {self.__class__.__name__} due to {reason}...') try: - if self._eventloop.is_running(): - # Cancel all pending tasks in the event loop. - _run_async_coro(self._eventloop, _shutdown_async_tasks(self._eventloop)) - - # Stop the event loop. - self._eventloop.call_soon_threadsafe(self._eventloop.stop) - - # Wait for the event loop thread to exit. - self._thread.join(timeout=_TIMEOUT) - - # If the thread is still alive, execute a forced shutdown. - if self._thread.is_alive(): - logger.warning('Event loop thread did not exit cleanly! Forcing shutdown...') - _force_exit_event_loop(self._eventloop, self._thread) + self._async_thread.close() except KeyboardInterrupt: logger.warning('Shutdown interrupted by KeyboardInterrupt!') @@ -184,7 +94,7 @@ def has_pending_requests(self) -> bool: raise TypeError('self._rq must be an instance of the RequestQueue class') try: - is_finished = _run_async_coro(self._eventloop, self._rq.is_finished()) + is_finished = self._async_thread.run_coro(self._rq.is_finished()) except Exception: traceback.print_exc() raise @@ -217,7 +127,7 @@ def enqueue_request(self, request: Request) -> bool: raise TypeError('self._rq must be an instance of the RequestQueue class') try: - result = _run_async_coro(self._eventloop, self._rq.add_request(apify_request)) + result = self._async_thread.run_coro(self._rq.add_request(apify_request)) except Exception: traceback.print_exc() raise @@ -236,7 +146,7 @@ def next_request(self) -> Request | None: raise TypeError('self._rq must be an instance of the RequestQueue class') try: - apify_request = _run_async_coro(self._eventloop, self._rq.fetch_next_request()) + apify_request = self._async_thread.run_coro(self._rq.fetch_next_request()) except Exception: traceback.print_exc() raise @@ -248,10 +158,10 @@ def next_request(self) -> Request | None: if not isinstance(self.spider, Spider): raise TypeError('self.spider must be an instance of the Spider class') - # Let the request queue know that the request is being handled. Every request should be marked as handled, - # retrying is handled by the Scrapy's RetryMiddleware. + # Let the request queue know that the request is being handled. Every request should + # be marked as handled, retrying is handled by the Scrapy's RetryMiddleware. 
try: - _run_async_coro(self._eventloop, self._rq.mark_request_as_handled(apify_request)) + self._async_thread.run_coro(self._rq.mark_request_as_handled(apify_request)) except Exception: traceback.print_exc() raise From f1a7fd7ce455de0ebf872981f8a2ac446d1e2a94 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Mon, 10 Feb 2025 14:44:55 +0100 Subject: [PATCH 09/16] address feedback --- .github/workflows/run_code_checks.yaml | 6 +- docs/02_guides/05_scrapy.mdx | 8 +- .../code/scrapy_project/src/__main__.py | 6 +- .../code/scrapy_project/src/items.py | 2 + .../02_guides/code/scrapy_project/src/main.py | 4 +- .../code/scrapy_project/src/settings.py | 10 -- .../src/{spiders/title.py => spiders.py} | 6 +- .../scrapy_project/src/spiders/__init__.py | 0 .../code/scrapy_project/src/spiders/py.typed | 0 pyproject.toml | 10 ++ src/apify/scrapy/_async_thread.py | 6 +- src/apify/scrapy/scheduler.py | 2 +- tests/integration/_utils.py | 6 + tests/integration/conftest.py | 4 +- tests/integration/test_actor_scrapy.py | 126 +++--------------- 15 files changed, 53 insertions(+), 143 deletions(-) rename docs/02_guides/code/scrapy_project/src/{spiders/title.py => spiders.py} (94%) delete mode 100644 docs/02_guides/code/scrapy_project/src/spiders/__init__.py delete mode 100644 docs/02_guides/code/scrapy_project/src/spiders/py.typed diff --git a/.github/workflows/run_code_checks.yaml b/.github/workflows/run_code_checks.yaml index 058739da..dd9b4d23 100644 --- a/.github/workflows/run_code_checks.yaml +++ b/.github/workflows/run_code_checks.yaml @@ -19,9 +19,9 @@ jobs: name: Unit tests uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main - # docs_check: - # name: Docs check - # uses: apify/workflows/.github/workflows/python_docs_check.yaml@main + docs_check: + name: Docs check + uses: apify/workflows/.github/workflows/python_docs_check.yaml@main integration_tests: name: Integration tests diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 97463e2e..250d7953 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem'; import UnderscoreMainExample from '!!raw-loader!./code/scrapy_project/src/__main__.py'; import MainExample from '!!raw-loader!./code/scrapy_project/src/main.py'; import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py'; -import TitleSpiderExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py'; +import SpidersExample from '!!raw-loader!./code/scrapy_project/src/spiders.py'; import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'; [Scrapy](https://scrapy.org/) is an open-source web scraping framework for Python. It provides tools for defining scrapers, extracting data from web pages, following links, and handling pagination. With the Apify SDK, Scrapy projects can be converted into Apify [Actors](https://docs.apify.com/platform/actors), integrated with Apify [storages](https://docs.apify.com/platform/storage), and executed on the Apify [platform](https://docs.apify.com/platform). @@ -48,7 +48,7 @@ Additional helper functions in the [`apify.scrapy`](https://github.com/apify/api ## Create a new Apify-Scrapy project -The simplest way to start using Scrapy in Apify Actors is to use the [Scrapy Actor template](https://apify.com/templates/categories/python). 
The template provides a pre-configured project structure and setup that includes all necessary components to run Scrapy spiders as Actors and store their output in Apify datasets. If you prefer manual setup, refer to the example Actor section below for configuration details. +The simplest way to start using Scrapy in Apify Actors is to use the [Scrapy Actor template](https://apify.com/templates/python-scrapy). The template provides a pre-configured project structure and setup that includes all necessary components to run Scrapy spiders as Actors and store their output in Apify datasets. If you prefer manual setup, refer to the example Actor section below for configuration details. ## Wrapping an existing Scrapy project @@ -80,9 +80,9 @@ The following example demonstrates a Scrapy Actor that scrapes page titles and e {ItemsExample} </CodeBlock> </TabItem> - <TabItem value="spiders/title.py" label="spiders/title.py"> + <TabItem value="spiders.py" label="spiders.py"> <CodeBlock className="language-python"> - {TitleSpiderExample} + {SpidersExample} </CodeBlock> </TabItem> <TabItem value="settings.py" label="settings.py"> diff --git a/docs/02_guides/code/scrapy_project/src/__main__.py b/docs/02_guides/code/scrapy_project/src/__main__.py index 7bda6f73..3dcbf75c 100644 --- a/docs/02_guides/code/scrapy_project/src/__main__.py +++ b/docs/02_guides/code/scrapy_project/src/__main__.py @@ -1,13 +1,15 @@ -# ruff: noqa: E402, I001 - from __future__ import annotations + from twisted.internet import asyncioreactor # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. asyncioreactor.install() # type: ignore[no-untyped-call] import os + from apify.scrapy import initialize_logging, run_scrapy_actor + +# Import your main Actor coroutine here. from .main import main # Ensure the location to the Scrapy settings module is defined. diff --git a/docs/02_guides/code/scrapy_project/src/items.py b/docs/02_guides/code/scrapy_project/src/items.py index ba926f2d..6579083f 100644 --- a/docs/02_guides/code/scrapy_project/src/items.py +++ b/docs/02_guides/code/scrapy_project/src/items.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from scrapy import Field, Item diff --git a/docs/02_guides/code/scrapy_project/src/main.py b/docs/02_guides/code/scrapy_project/src/main.py index eb967296..a5586a25 100644 --- a/docs/02_guides/code/scrapy_project/src/main.py +++ b/docs/02_guides/code/scrapy_project/src/main.py @@ -1,5 +1,3 @@ -# ruff: noqa: I001 - from __future__ import annotations from scrapy.crawler import CrawlerRunner @@ -9,7 +7,7 @@ from apify.scrapy import apply_apify_settings # Import your Scrapy spider here. 
-from .spiders.title import TitleSpider as Spider +from .spiders import TitleSpider as Spider async def main() -> None: diff --git a/docs/02_guides/code/scrapy_project/src/settings.py b/docs/02_guides/code/scrapy_project/src/settings.py index 4c5aaacb..ed51668a 100644 --- a/docs/02_guides/code/scrapy_project/src/settings.py +++ b/docs/02_guides/code/scrapy_project/src/settings.py @@ -6,13 +6,3 @@ SPIDER_MODULES = ['src.spiders'] TELNETCONSOLE_ENABLED = False TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' - -ITEM_PIPELINES = { - 'src.pipelines.TitleItemPipeline': 123, -} -SPIDER_MIDDLEWARES = { - 'src.middlewares.TitleSpiderMiddleware': 543, -} -DOWNLOADER_MIDDLEWARES = { - 'src.middlewares.TitleDownloaderMiddleware': 543, -} diff --git a/docs/02_guides/code/scrapy_project/src/spiders/title.py b/docs/02_guides/code/scrapy_project/src/spiders.py similarity index 94% rename from docs/02_guides/code/scrapy_project/src/spiders/title.py rename to docs/02_guides/code/scrapy_project/src/spiders.py index d3bced34..a5e1f473 100644 --- a/docs/02_guides/code/scrapy_project/src/spiders/title.py +++ b/docs/02_guides/code/scrapy_project/src/spiders.py @@ -1,5 +1,3 @@ -# ruff: noqa: TID252, RUF012 - from __future__ import annotations from typing import TYPE_CHECKING, Any @@ -7,7 +5,7 @@ from scrapy import Request, Spider -from ..items import TitleItem +from .items import TitleItem if TYPE_CHECKING: from collections.abc import Generator @@ -53,7 +51,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None response: The web page response. Yields: - Yields scraped TitleItem and Requests for links. + Yields scraped `TitleItem` and new `Request` objects for links. """ self.logger.info('TitleSpider is parsing %s...', response) diff --git a/docs/02_guides/code/scrapy_project/src/spiders/__init__.py b/docs/02_guides/code/scrapy_project/src/spiders/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/02_guides/code/scrapy_project/src/spiders/py.typed b/docs/02_guides/code/scrapy_project/src/spiders/py.typed deleted file mode 100644 index e69de29b..00000000 diff --git a/pyproject.toml b/pyproject.toml index bb27ebb2..7c1205bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,6 +136,16 @@ indent-style = "space" "TRY301", # Abstract `raise` to an inner function "PLW0603", # Using the global statement to update `{name}` is discouraged ] +"**/docs/**/scrapy_project/**/__main__.py" = [ + # Because of asyncioreactor.install() call. + "E402", # Module level import not at top of file +] +"**/docs/**/scrapy_project/**" = [ + # Local imports are mixed up with the Apify SDK. + "I001", # Import block is un-sorted or un-formatted + # Class variables are common in Scrapy projects. + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" diff --git a/src/apify/scrapy/_async_thread.py b/src/apify/scrapy/_async_thread.py index df98b0ff..166e8b17 100644 --- a/src/apify/scrapy/_async_thread.py +++ b/src/apify/scrapy/_async_thread.py @@ -48,12 +48,12 @@ def run_coro( The result returned by the coroutine. Raises: + RuntimeError: If the event loop is not running. TimeoutError: If the coroutine does not complete within the timeout. Exception: Any exception raised during coroutine execution. """ if not self._eventloop.is_running(): - logger.warning('Event loop is not running! 
Ignoring coroutine execution.') - return None + raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is not running.') # Submit the coroutine to the event loop running in the other thread. future = asyncio.run_coroutine_threadsafe(coro, self._eventloop) @@ -104,8 +104,6 @@ async def _shutdown_tasks(self) -> None: """Cancel all pending tasks in the event loop.""" # Retrieve all tasks for the event loop, excluding the current task. tasks = [task for task in asyncio.all_tasks(self._eventloop) if task is not asyncio.current_task()] - if not tasks: - return # Cancel each pending task. for task in tasks: diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index d658efeb..a243a368 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -22,7 +22,7 @@ class ApifyScheduler(BaseScheduler): - """A Scrapy scheduler that uses the Apify request queue to manage requests. + """A Scrapy scheduler that uses the Apify `RequestQueue` to manage requests. This scheduler requires the asyncio Twisted reactor to be installed. """ diff --git a/tests/integration/_utils.py b/tests/integration/_utils.py index cbea845d..a0ca2fa0 100644 --- a/tests/integration/_utils.py +++ b/tests/integration/_utils.py @@ -3,6 +3,12 @@ from crawlee._utils.crypto import crypto_random_object_id +def read_file(file_path: str) -> str: + """Read the content of a file and return it as a string.""" + with open(file_path, encoding='utf-8') as file: + return file.read() + + def generate_unique_resource_name(label: str) -> str: """Generates a unique resource name, which will contain the given label.""" label = label.replace('_', '-') diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 9a74924a..6e9682b3 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -349,14 +349,14 @@ def __call__( @pytest.fixture -async def run_actor(apify_client_async: ApifyClientAsync) -> RunActorFunction: +async def run_actor(apify_client_async: ApifyClientAsync, run_input: Any = None) -> RunActorFunction: """Fixture for calling an Actor run and waiting for its completion. This fixture returns a function that initiates an Actor run with optional run input, waits for its completion, and retrieves the final result. It uses the `wait_for_finish` method with a timeout of 10 minutes. """ - async def _run_actor(actor: ActorClientAsync, *, run_input: Any = None) -> ActorRun: + async def _run_actor(actor: ActorClientAsync, *, run_input: Any = run_input) -> ActorRun: call_result = await actor.call(run_input=run_input) assert isinstance(call_result, dict), 'The result of ActorClientAsync.call() is not a dictionary.' 
diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index 52a03188..19a3d5f6 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -5,125 +5,31 @@ if TYPE_CHECKING: from .conftest import MakeActorFunction, RunActorFunction +from ._utils import read_file + async def test_actor_scrapy_title_spider( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: actor_source_files = { - 'src/spiders/title.py': """ - from __future__ import annotations - from typing import TYPE_CHECKING, Any - from urllib.parse import urljoin - from scrapy import Request, Spider - from ..items import TitleItem - - if TYPE_CHECKING: - from collections.abc import Generator - from scrapy.responsetypes import Response - - - class TitleSpider(Spider): - name = 'title_spider' - - # Limit the number of pages to scrape. - custom_settings = {'CLOSESPIDER_PAGECOUNT': 10} - - def __init__( - self, - start_urls: list[str], - allowed_domains: list[str], - *args: Any, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) - self.start_urls = start_urls - self.allowed_domains = allowed_domains - - def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: - self.logger.info('TitleSpider is parsing %s...', response) - url = response.url - title = response.css('title::text').extract_first() - yield TitleItem(url=url, title=title) - - for link_href in response.css('a::attr("href")'): - link_url = urljoin(response.url, link_href.get()) - if link_url.startswith(('http://', 'https://')): - yield Request(link_url) - """, - 'src/spiders/__init__.py': """ - from .title import TitleSpider - """, - 'src/items.py': """ - import scrapy - - class TitleItem(scrapy.Item): - url = scrapy.Field() - title = scrapy.Field() - """, - 'src/settings.py': """ - BOT_NAME = 'titlebot' - DEPTH_LIMIT = 1 - LOG_LEVEL = 'INFO' - NEWSPIDER_MODULE = 'src.spiders' - ROBOTSTXT_OBEY = True - SPIDER_MODULES = ['src.spiders'] - TELNETCONSOLE_ENABLED = False - TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' - """, - 'src/__init__.py': '', - 'src/main.py': """ - from __future__ import annotations - from scrapy.crawler import CrawlerRunner - from scrapy.utils.defer import deferred_to_future - from apify import Actor - from apify.scrapy.utils import apply_apify_settings - from .spiders.title import TitleSpider as Spider - - - async def main() -> None: - async with Actor: - Actor.log.info('Actor is being executed...') - - # Retrieve and process Actor input. - start_urls = ['https://crawlee.dev'] - allowed_domains = ['crawlee.dev'] - proxy_config = {'useApifyProxy': True} - - # Apply Apify settings, which will override the Scrapy project settings. - settings = apply_apify_settings(proxy_config=proxy_config) - - # Create CrawlerRunner and execute the Scrapy spider. - crawler_runner = CrawlerRunner(settings) - crawl_deferred = crawler_runner.crawl( - Spider, - start_urls=start_urls, - allowed_domains=allowed_domains, - ) - await deferred_to_future(crawl_deferred) - """, - 'src/__main__.py': """ - from __future__ import annotations - from twisted.internet import asyncioreactor - - # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components. - asyncioreactor.install() - - import os - from apify.scrapy import initialize_logging, run_scrapy_actor - from .main import main - - # Ensure the location to the Scrapy settings module is defined. 
- os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' - - if __name__ == '__main__': - initialize_logging() - run_scrapy_actor(main()) - """, + 'src/spiders.py': read_file('docs/02_guides/code/scrapy_project/src/spiders.py'), + 'src/items.py': read_file('docs/02_guides/code/scrapy_project/src/items.py'), + 'src/settings.py': read_file('docs/02_guides/code/scrapy_project/src/settings.py'), + 'src/__init__.py': read_file('docs/02_guides/code/scrapy_project/src/__init__.py'), + 'src/main.py': read_file('docs/02_guides/code/scrapy_project/src/main.py'), + 'src/__main__.py': read_file('docs/02_guides/code/scrapy_project/src/__main__.py'), } actor = await make_actor('actor-scrapy-title-spider', source_files=actor_source_files) - run_result = await run_actor(actor) + run_result = await run_actor( + actor, + run_input={ + 'startUrls': [{'url': 'https://crawlee.dev'}], + 'allowedDomains': ['crawlee.dev'], + 'proxyConfiguration': {'useApifyProxy': True}, + }, + ) assert run_result.status == 'SUCCEEDED' From f99a9eb7c31f9d446106d518226fca5d766db39a Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Mon, 10 Feb 2025 15:06:40 +0100 Subject: [PATCH 10/16] polishment --- docs/02_guides/05_scrapy.mdx | 14 +++++++------- .../code/scrapy_project/src/spiders/__init__.py | 1 + .../code/scrapy_project/src/spiders/py.typed | 0 .../src/{spiders.py => spiders/title.py} | 2 +- pyproject.toml | 2 ++ tests/integration/test_actor_scrapy.py | 9 +++++---- 6 files changed, 16 insertions(+), 12 deletions(-) create mode 100644 docs/02_guides/code/scrapy_project/src/spiders/__init__.py create mode 100644 docs/02_guides/code/scrapy_project/src/spiders/py.typed rename docs/02_guides/code/scrapy_project/src/{spiders.py => spiders/title.py} (98%) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 250d7953..d3aafdad 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem'; import UnderscoreMainExample from '!!raw-loader!./code/scrapy_project/src/__main__.py'; import MainExample from '!!raw-loader!./code/scrapy_project/src/main.py'; import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py'; -import SpidersExample from '!!raw-loader!./code/scrapy_project/src/spiders.py'; +import SpidersExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py'; import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'; [Scrapy](https://scrapy.org/) is an open-source web scraping framework for Python. It provides tools for defining scrapers, extracting data from web pages, following links, and handling pagination. With the Apify SDK, Scrapy projects can be converted into Apify [Actors](https://docs.apify.com/platform/actors), integrated with Apify [storages](https://docs.apify.com/platform/storage), and executed on the Apify [platform](https://docs.apify.com/platform). 
@@ -75,19 +75,19 @@ The following example demonstrates a Scrapy Actor that scrapes page titles and e {MainExample} </CodeBlock> </TabItem> - <TabItem value="items.py" label="items.py"> + <TabItem value="settings.py" label="settings.py"> <CodeBlock className="language-python"> - {ItemsExample} + {SettingsExample} </CodeBlock> </TabItem> - <TabItem value="spiders.py" label="spiders.py"> + <TabItem value="items.py" label="items.py"> <CodeBlock className="language-python"> - {SpidersExample} + {ItemsExample} </CodeBlock> </TabItem> - <TabItem value="settings.py" label="settings.py"> + <TabItem value="spiders/title.py" label="spiders/title.py"> <CodeBlock className="language-python"> - {SettingsExample} + {SpidersExample} </CodeBlock> </TabItem> </Tabs> diff --git a/docs/02_guides/code/scrapy_project/src/spiders/__init__.py b/docs/02_guides/code/scrapy_project/src/spiders/__init__.py new file mode 100644 index 00000000..f63ac977 --- /dev/null +++ b/docs/02_guides/code/scrapy_project/src/spiders/__init__.py @@ -0,0 +1 @@ +from .title import TitleSpider diff --git a/docs/02_guides/code/scrapy_project/src/spiders/py.typed b/docs/02_guides/code/scrapy_project/src/spiders/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/docs/02_guides/code/scrapy_project/src/spiders.py b/docs/02_guides/code/scrapy_project/src/spiders/title.py similarity index 98% rename from docs/02_guides/code/scrapy_project/src/spiders.py rename to docs/02_guides/code/scrapy_project/src/spiders/title.py index a5e1f473..ed54b3c3 100644 --- a/docs/02_guides/code/scrapy_project/src/spiders.py +++ b/docs/02_guides/code/scrapy_project/src/spiders/title.py @@ -5,7 +5,7 @@ from scrapy import Request, Spider -from .items import TitleItem +from ..items import TitleItem if TYPE_CHECKING: from collections.abc import Generator diff --git a/pyproject.toml b/pyproject.toml index 7c1205bb..f479bb84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,6 +145,8 @@ indent-style = "space" "I001", # Import block is un-sorted or un-formatted # Class variables are common in Scrapy projects. "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + # Local imports in Scrapy project. 
+ "TID252", # Prefer absolute imports over relative imports from parent modules ] [tool.ruff.lint.flake8-quotes] diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index 19a3d5f6..a109fa8e 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -13,12 +13,13 @@ async def test_actor_scrapy_title_spider( run_actor: RunActorFunction, ) -> None: actor_source_files = { - 'src/spiders.py': read_file('docs/02_guides/code/scrapy_project/src/spiders.py'), - 'src/items.py': read_file('docs/02_guides/code/scrapy_project/src/items.py'), - 'src/settings.py': read_file('docs/02_guides/code/scrapy_project/src/settings.py'), 'src/__init__.py': read_file('docs/02_guides/code/scrapy_project/src/__init__.py'), - 'src/main.py': read_file('docs/02_guides/code/scrapy_project/src/main.py'), 'src/__main__.py': read_file('docs/02_guides/code/scrapy_project/src/__main__.py'), + 'src/items.py': read_file('docs/02_guides/code/scrapy_project/src/items.py'), + 'src/main.py': read_file('docs/02_guides/code/scrapy_project/src/main.py'), + 'src/settings.py': read_file('docs/02_guides/code/scrapy_project/src/settings.py'), + 'src/spiders/__init__.py': read_file('docs/02_guides/code/scrapy_project/src/spiders/__init__.py'), + 'src/spiders/title.py': read_file('docs/02_guides/code/scrapy_project/src/spiders/title.py'), } actor = await make_actor('actor-scrapy-title-spider', source_files=actor_source_files) From 7f2242dd62d13c46a9d1a09416d11c743ae1f6ea Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Mon, 10 Feb 2025 16:52:28 +0100 Subject: [PATCH 11/16] do not run sys.exit for scrapy --- src/apify/_actor.py | 6 ++++++ .../integration/actor_source_base/requirements.txt | 1 - tests/integration/conftest.py | 13 +++++++++++++ tests/integration/test_actor_scrapy.py | 6 +++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 26fb0ce8..df353f32 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -4,6 +4,7 @@ import os import sys from datetime import timedelta +from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar, cast, overload from lazy_object_proxy import Proxy @@ -266,10 +267,15 @@ async def finalize() -> None: await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False + # Find out if Scrapy is installed. 
+ scrapy_installed = find_spec('scrapy') is not None + if is_running_in_ipython(): self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython') elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test') + elif scrapy_installed: + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in Scrapy') else: sys.exit(exit_code) diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index cd19947d..0df1ff38 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,3 +1,2 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER -scrapy~=2.12.0 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 6e9682b3..a01422d2 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -198,6 +198,7 @@ def __call__( main_func: Callable | None = None, main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, + additional_requirements: list[str] | None = None, ) -> Awaitable[ActorClientAsync]: """Create a temporary Actor from the given main function or source files. @@ -211,6 +212,7 @@ def __call__( main_func: The main function of the Actor. main_py: The `src/main.py` file of the Actor. source_files: A dictionary of the source files of the Actor. + additional_requirements: A list of additional requirements to be added to the `requirements.txt`. Returns: A resource client for the created Actor. @@ -235,6 +237,7 @@ async def _make_actor( main_func: Callable | None = None, main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, + additional_requirements: list[str] | None = None, ) -> ActorClientAsync: if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') @@ -270,6 +273,16 @@ async def _make_actor( actor_source_files = actor_base_source_files.copy() actor_source_files.update(source_files) + if additional_requirements: + # Get the current requirements.txt content (as a string). + req_content = actor_source_files.get('requirements.txt', '') + if isinstance(req_content, bytes): + req_content = req_content.decode('utf-8') + # Append the additional requirements, each on a new line. + additional_reqs = '\n'.join(additional_requirements) + req_content = req_content.strip() + '\n' + additional_reqs + '\n' + actor_source_files['requirements.txt'] = req_content + # Reformat the source files in a format that the Apify API understands. 
source_files_for_api = [] for file_name, file_contents in actor_source_files.items(): diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index a109fa8e..ba43101a 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -22,7 +22,11 @@ async def test_actor_scrapy_title_spider( 'src/spiders/title.py': read_file('docs/02_guides/code/scrapy_project/src/spiders/title.py'), } - actor = await make_actor('actor-scrapy-title-spider', source_files=actor_source_files) + actor = await make_actor( + 'actor-scrapy-title-spider', + source_files=actor_source_files, + additional_requirements=['scrapy~=2.12.0'], + ) run_result = await run_actor( actor, run_input={ From 57086f5c8944b6c8cc06f53d307e5d0fe295df06 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Mon, 10 Feb 2025 18:05:44 +0100 Subject: [PATCH 12/16] Fix RQ stuck in infinite loop due to ID mismatch Closes: #392 --- src/apify/scrapy/requests.py | 99 +++++++++++++++--------------------- 1 file changed, 42 insertions(+), 57 deletions(-) diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index b4d3fd83..6a4badc4 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -5,27 +5,18 @@ from logging import getLogger from typing import Any, cast -from scrapy import Request, Spider +from scrapy import Request as ScrapyRequest +from scrapy import Spider from scrapy.http.headers import Headers from scrapy.utils.request import request_from_dict -from crawlee import Request as CrawleeRequest +from crawlee import Request as ApifyRequest from crawlee._types import HttpHeaders -from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id logger = getLogger(__name__) -def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: - """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False. - - Works for RetryMiddleware and RedirectMiddleware. - """ - return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times')) - - -def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None: +def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None: """Convert a Scrapy request to an Apify request. Args: @@ -35,51 +26,45 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest Returns: The converted Apify request if the conversion was successful, otherwise None. """ - if not isinstance(scrapy_request, Request): - logger.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.') # type: ignore[unreachable] + if not isinstance(scrapy_request, ScrapyRequest): + logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.') # type: ignore[unreachable] return None logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...') + # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter. 
+ request_kwargs: dict[str, Any] = { + 'url': scrapy_request.url, + 'method': scrapy_request.method, + 'payload': scrapy_request.body, + 'use_extended_unique_key': True, + 'keep_url_fragment': False, + } + try: - if _is_request_produced_by_middleware(scrapy_request): - unique_key = compute_unique_key( - url=scrapy_request.url, - method=scrapy_request.method, # type: ignore[arg-type] # str vs literal - payload=scrapy_request.body, - use_extended_unique_key=True, - ) - elif scrapy_request.dont_filter: - unique_key = crypto_random_object_id(8) - elif scrapy_request.meta.get('apify_request_unique_key'): - unique_key = scrapy_request.meta['apify_request_unique_key'] + if scrapy_request.dont_filter: + request_kwargs['always_enqueue'] = True else: - unique_key = crypto_random_object_id(8) + if scrapy_request.meta.get('apify_request_unique_key'): + request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key'] - if scrapy_request.meta.get('apify_request_id'): - request_id = scrapy_request.meta['apify_request_id'] - else: - request_id = unique_key_to_request_id(unique_key) - - apify_request = CrawleeRequest( - url=scrapy_request.url, - method=scrapy_request.method, - payload=scrapy_request.body, - user_data=scrapy_request.meta.get('userData', {}), - unique_key=unique_key, - id=request_id, - ) + if scrapy_request.meta.get('apify_request_id'): + request_kwargs['id'] = scrapy_request.meta['apify_request_id'] + + request_kwargs['user_data'] = scrapy_request.meta.get('userData', {}) # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request if isinstance(scrapy_request.headers, Headers): - apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict())) + request_kwargs['headers'] = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict())) else: logger.warning( # type: ignore[unreachable] f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}' ) - # Serialize the Scrapy Request and store it in the apify_request. - # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64, + apify_request = ApifyRequest.from_url(**request_kwargs) + + # Serialize the Scrapy ScrapyRequest and store it in the apify_request. + # - This process involves converting the Scrapy ScrapyRequest object into a dictionary, encoding it to base64, # and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request. # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/. scrapy_request_dict = scrapy_request.to_dict(spider=spider) @@ -94,7 +79,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest return apify_request -def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: +def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest: """Convert an Apify request to a Scrapy request. Args: @@ -102,23 +87,23 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: spider: The Scrapy spider that the request is associated with. Raises: - TypeError: If the apify_request is not a crawlee request. - ValueError: If the apify_request does not contain the required keys. + TypeError: If the Apify request is not an instance of the `ApifyRequest` class. + ValueError: If the Apify request does not contain the required keys. Returns: The converted Scrapy request. 
""" - if not isinstance(cast(Any, apify_request), CrawleeRequest): - raise TypeError('apify_request must be a crawlee.Request instance') + if not isinstance(cast(Any, apify_request), ApifyRequest): + raise TypeError('apify_request must be a crawlee.ScrapyRequest instance') logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...') # If the apify_request comes from the Scrapy if 'scrapy_request' in apify_request.user_data: - # Deserialize the Scrapy Request from the apify_request. + # Deserialize the Scrapy ScrapyRequest from the apify_request. # - This process involves decoding the base64-encoded request data and reconstructing - # the Scrapy Request object from its dictionary representation. - logger.debug('Restoring the Scrapy Request from the apify_request...') + # the Scrapy ScrapyRequest object from its dictionary representation. + logger.debug('Restoring the Scrapy ScrapyRequest from the apify_request...') scrapy_request_dict_encoded = apify_request.user_data['scrapy_request'] if not isinstance(scrapy_request_dict_encoded, str): @@ -129,10 +114,10 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: raise TypeError('scrapy_request_dict must be a dictionary') scrapy_request = request_from_dict(scrapy_request_dict, spider=spider) - if not isinstance(scrapy_request, Request): - raise TypeError('scrapy_request must be an instance of the Request class') + if not isinstance(scrapy_request, ScrapyRequest): + raise TypeError('scrapy_request must be an instance of the ScrapyRequest class') - logger.debug(f'Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...') + logger.debug(f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={scrapy_request})...') # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} @@ -140,11 +125,11 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: # scrapy_request.meta is a property, so we have to set it like this scrapy_request._meta = meta # noqa: SLF001 - # If the apify_request comes directly from the Request Queue, typically start URLs + # If the apify_request comes directly from the Scrapy, typically start URLs. 
else: - logger.debug('Gonna create a new Scrapy Request (cannot be restored)') + logger.debug('Gonna create a new Scrapy ScrapyRequest (cannot be restored)') - scrapy_request = Request( + scrapy_request = ScrapyRequest( url=apify_request.url, method=apify_request.method, meta={ From 8646c068276885bb493b10b706b1ac21dc578a9e Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Tue, 11 Feb 2025 09:22:38 +0100 Subject: [PATCH 13/16] Address Honza's feedback --- tests/integration/_utils.py | 6 ------ tests/integration/conftest.py | 4 ++-- tests/integration/test_actor_scrapy.py | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/tests/integration/_utils.py b/tests/integration/_utils.py index a0ca2fa0..cbea845d 100644 --- a/tests/integration/_utils.py +++ b/tests/integration/_utils.py @@ -3,12 +3,6 @@ from crawlee._utils.crypto import crypto_random_object_id -def read_file(file_path: str) -> str: - """Read the content of a file and return it as a string.""" - with open(file_path, encoding='utf-8') as file: - return file.read() - - def generate_unique_resource_name(label: str) -> str: """Generates a unique resource name, which will contain the given label.""" label = label.replace('_', '-') diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a01422d2..96052805 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -362,14 +362,14 @@ def __call__( @pytest.fixture -async def run_actor(apify_client_async: ApifyClientAsync, run_input: Any = None) -> RunActorFunction: +async def run_actor(apify_client_async: ApifyClientAsync) -> RunActorFunction: """Fixture for calling an Actor run and waiting for its completion. This fixture returns a function that initiates an Actor run with optional run input, waits for its completion, and retrieves the final result. It uses the `wait_for_finish` method with a timeout of 10 minutes. """ - async def _run_actor(actor: ActorClientAsync, *, run_input: Any = run_input) -> ActorRun: + async def _run_actor(actor: ActorClientAsync, *, run_input: Any = None) -> ActorRun: call_result = await actor.call(run_input=run_input) assert isinstance(call_result, dict), 'The result of ActorClientAsync.call() is not a dictionary.' 
diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index ba43101a..5cad9ad6 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -1,25 +1,26 @@ from __future__ import annotations +from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: from .conftest import MakeActorFunction, RunActorFunction -from ._utils import read_file - async def test_actor_scrapy_title_spider( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: + base_path = Path('docs/02_guides/code/scrapy_project') + actor_source_files = { - 'src/__init__.py': read_file('docs/02_guides/code/scrapy_project/src/__init__.py'), - 'src/__main__.py': read_file('docs/02_guides/code/scrapy_project/src/__main__.py'), - 'src/items.py': read_file('docs/02_guides/code/scrapy_project/src/items.py'), - 'src/main.py': read_file('docs/02_guides/code/scrapy_project/src/main.py'), - 'src/settings.py': read_file('docs/02_guides/code/scrapy_project/src/settings.py'), - 'src/spiders/__init__.py': read_file('docs/02_guides/code/scrapy_project/src/spiders/__init__.py'), - 'src/spiders/title.py': read_file('docs/02_guides/code/scrapy_project/src/spiders/title.py'), + 'src/__init__.py': (base_path / 'src/__init__.py').read_text(), + 'src/__main__.py': (base_path / 'src/__main__.py').read_text(), + 'src/items.py': (base_path / 'src/items.py').read_text(), + 'src/main.py': (base_path / 'src/main.py').read_text(), + 'src/settings.py': (base_path / 'src/settings.py').read_text(), + 'src/spiders/__init__.py': (base_path / 'src/spiders/__init__.py').read_text(), + 'src/spiders/title.py': (base_path / 'src/spiders/title.py').read_text(), } actor = await make_actor( From 4a9665ae96e321bcedc11e525c7eb4c01e7cdf51 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Tue, 11 Feb 2025 13:50:34 +0100 Subject: [PATCH 14/16] utilize SCRAPY_SETTINGS_MODULE env var --- src/apify/_actor.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index df353f32..d675f1bd 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -4,7 +4,6 @@ import os import sys from datetime import timedelta -from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar, cast, overload from lazy_object_proxy import Proxy @@ -267,15 +266,12 @@ async def finalize() -> None: await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False - # Find out if Scrapy is installed. 
- scrapy_installed = find_spec('scrapy') is not None - if is_running_in_ipython(): self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython') elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test') - elif scrapy_installed: - self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in Scrapy') + elif os.getenv('SCRAPY_SETTINGS_MODULE'): + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running with Scrapy') else: sys.exit(exit_code) From f1e8bd5e0478fb164a12fefb63e6a9c74d161a67 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Tue, 11 Feb 2025 16:00:21 +0100 Subject: [PATCH 15/16] allow ajax crawl middleware --- src/apify/scrapy/utils.py | 4 ---- tests/unit/scrapy/utils/test_apply_apify_settings.py | 1 - 2 files changed, 5 deletions(-) diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 894cda95..860c1c33 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -40,10 +40,6 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict # ensuring it is executed as the final step in the pipeline sequence settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000 - # Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request - # during process_response, but currently we have no way of detecting it and handling it properly. - settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None - # Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750 diff --git a/tests/unit/scrapy/utils/test_apply_apify_settings.py b/tests/unit/scrapy/utils/test_apply_apify_settings.py index 64e67a24..6c5227c0 100644 --- a/tests/unit/scrapy/utils/test_apply_apify_settings.py +++ b/tests/unit/scrapy/utils/test_apply_apify_settings.py @@ -42,7 +42,6 @@ def test_updates_downloader_middlewares() -> None: assert new_settings.get('DOWNLOADER_MIDDLEWARES') == { 'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 750, - 'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': None, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543, 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None, 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123, From 413a56a01aadf704d9f22f9f7a72d56cd905adf9 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Wed, 12 Feb 2025 18:40:41 +0100 Subject: [PATCH 16/16] mention SCRAPY_SETTINGS_MODULE env var --- docs/02_guides/05_scrapy.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index d3aafdad..35b6bb5e 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -25,6 +25,8 @@ The Apify SDK provides an Apify-Scrapy integration. The main challenge of this i In this setup, `apify.scrapy.initialize_logging` configures an Apify log formatter and reconfigures loggers to ensure consistent logging across Scrapy, the Apify SDK, and other libraries. 
 The `apify.scrapy.run_scrapy_actor` bridges asyncio coroutines with Twisted's reactor, enabling the Actor's main coroutine, which contains the Scrapy spider, to be executed.
 
+Make sure the `SCRAPY_SETTINGS_MODULE` environment variable is set to the Python import path of your Scrapy settings module (for example, `src.settings`). The `Actor` class also uses this variable to detect that it is running within a Scrapy project and adjusts its behavior accordingly, for example by not calling `sys.exit` when the Actor exits.
+
 <CodeBlock className="language-python" title="main.py: The Actor main coroutine">
   {MainExample}
 </CodeBlock>
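To make the wiring described above more concrete, the following is a minimal, illustrative sketch of a project `__main__.py` that ties `initialize_logging`, `run_scrapy_actor`, and `SCRAPY_SETTINGS_MODULE` together. The settings module path (`src.settings`), the reactor installation call, and the `from .main import main` import are assumptions based on the guide's project layout, and the sketch assumes `run_scrapy_actor` accepts the Actor's main coroutine object as described above; the `__main__.py` shipped with the guide's Scrapy project remains the authoritative version.

```python
from __future__ import annotations

import os

from scrapy.utils.reactor import install_reactor

# Install Twisted's asyncio-based reactor as early as possible, so that Twisted (Scrapy)
# and asyncio (Apify SDK) code can share a single event loop.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

# Point Scrapy (and the Apify SDK's Scrapy detection) to the project's settings module.
# The import path is an assumption based on the guide's project layout.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

from apify.scrapy import initialize_logging, run_scrapy_actor

from .main import main  # the Actor's main coroutine, as rendered in the guide

if __name__ == '__main__':
    # Unify logging across Scrapy, the Apify SDK and other libraries, then run the
    # async main() coroutine on top of Twisted's reactor.
    initialize_logging()
    run_scrapy_actor(main())
```

With a package layout like this, the Actor would typically be started with `python -m src`, which executes the module above before any spider code runs.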