11"""Apify Actor integration for Scrapy projects.
22
3- This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
4- logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
3+ This module transforms a Scrapy project into an Apify Actor, handling the configuration
4+ of logging, patching Scrapy's logging system, and establishing the required environment
5+ to run the Scrapy spider within the Apify platform.
56
6- This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
7- or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
8- `scrapy crawl title_spider`.
7+ This file is specifically designed to be executed when the project is run as an Apify
8+ Actor using `apify run` locally or being run on the Apify platform. It is not being
9+ executed when running the project as a Scrapy project using `scrapy crawl title_spider`.
910
1011We recommend you do not modify this file unless you really know what you are doing.
1112"""
1213
1314# ruff: noqa: E402
1415
# We need to configure the logging first before we import anything else, so that nothing
# else imports `scrapy.utils.log` before we patch it.
1718from __future__ import annotations
1819
1920from logging import StreamHandler , getLogger
# Third-party loggers that should follow the project-wide log level as well.
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']

# Every logger we manage: the main Apify/Scrapy loggers plus the third-party ones.
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
3132
# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the
# field is not present in the file, Scrapy will default to `DEBUG`. This setting applies
# to all loggers. If you wish to change the logging level for a specific logger,
# do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']
3739
4042apify_handler .setFormatter (ActorLogFormatter (include_logger_name = True ))
4143
4244
43- def configure_logger (logger_name : str | None , log_level : str , * handlers : StreamHandler ) -> None :
45+ def configure_logger (
46+ logger_name : str | None , log_level : str , * handlers : StreamHandler
47+ ) -> None :
4448 """Configure a logger with the specified settings.
4549
4650 Args:
@@ -56,41 +60,46 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
5660 logger .addHandler (handler )
5761
5862
# Apify loggers have to be set up both here and in `new_configure_logging` so that they
# are usable from `main.py` as well as from the Scrapy components.
for logger_name in MAIN_LOGGER_NAMES:
    configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
6368
# Attaching our log handler to the loggers up front does not work, because Scrapy's
# `configure_logging` removes existing handlers here:
# https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113
# (even though `disable_existing_loggers` is set to False :facepalm:). To avoid losing
# log messages, we keep a reference to the original function and monkeypatch it below,
# so our handler is re-attached right after every `configure_logging` call.
old_configure_logging = scrapy_logging.configure_logging
7077
7178
def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    """Configure logging for Scrapy and root loggers to ensure consistent log behavior.

    Configuring only the root logger is not sufficient, because Scrapy overrides it
    with its own settings. Scrapy uses four primary loggers
    (https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77), so both
    those loggers and the root logger are configured here manually.
    """
    # Let Scrapy run its original logging setup first; our handler is attached after.
    old_configure_logging(*args, **kwargs)

    # The root (None) logger gets the Apify handler so that logs emitted from spiders
    # via the `self.logger` property are displayed properly. See the Spider logger
    # property:
    # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
    configure_logger(None, LOGGING_LEVEL, apify_handler)

    # The remaining loggers only have their level set; the custom handler lives solely
    # on the root logger, which avoids duplicate log messages.
    for name in ALL_LOGGER_NAMES:
        configure_logger(name, LOGGING_LEVEL)

    # HTTPX is too verbose and spams the logs with useless messages (especially when
    # running on the platform), so it is pinned to WARNING explicitly.
    configure_logger('httpx', 'WARNING')
95104
96105
@@ -105,12 +114,11 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
105114
106115from .main import main
107116
# Twisted (used by Scrapy) and AsyncIO (used by Apify) must cooperate, which requires
# setting the Twisted reactor to `AsyncioSelectorReactor` so the two asynchronous
# libraries can work together.
#
# Note: the reactor has to be installed before `nest_asyncio.apply()` is called,
# otherwise this will not work correctly on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()
116124
0 commit comments