@@ -1,19 +1,20 @@
"""Apify Actor integration for Scrapy projects.

- This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
- logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
+ This module transforms a Scrapy project into an Apify Actor, handling the configuration
+ of logging, patching Scrapy's logging system, and establishing the required environment
+ to run the Scrapy spider within the Apify platform.

- This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
- or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
- `scrapy crawl title_spider`.
+ This file is specifically designed to be executed when the project is run as an Apify
+ Actor, either locally using `apify run` or on the Apify platform. It is not executed
+ when the project runs as a plain Scrapy project via `scrapy crawl title_spider`.

We recommend you do not modify this file unless you really know what you are doing.
"""

# ruff: noqa: E402

- # We need to configure the logging first before we import anything else, so that nothing else imports
- # `scrapy.utils.log` before we patch it.
+ # We need to configure the logging first before we import anything else, so that nothing
+ # else imports `scrapy.utils.log` before we patch it.
from __future__ import annotations

from logging import StreamHandler, getLogger

@@ -29,27 +30,30 @@
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES

- # To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
- # Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
- # a specific logger, do it in this file.
+ # To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the
+ # field is not present in the file, Scrapy will default to `DEBUG`. This setting applies
+ # to all loggers. If you wish to change the logging level for a specific logger,
+ # do it in this file.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

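For reference, the `LOG_LEVEL` field mentioned above lives in the Scrapy project's `settings.py`; a minimal illustrative snippet (the chosen value is an example, not the template's default):

```python
# settings.py (illustrative): raise the global level so DEBUG noise is dropped.
# Any standard level name works: DEBUG, INFO, WARNING, ERROR, CRITICAL.
LOG_LEVEL = 'INFO'
```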
apify_handler = StreamHandler()
apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))


- def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
+ def configure_logger(
+     logger_name: str | None, log_level: str, *handlers: StreamHandler
+ ) -> None:
    """Configure a logger with the specified settings.

    Args:
@@ -56,41 +60,46 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
        logger.addHandler(handler)


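For orientation, the helper can also be exercised on its own; a minimal usage sketch with arbitrary logger name, level, and handler:

```python
# Usage sketch (illustrative arguments): reconfigure the 'twisted' logger
# to INFO and attach a plain stderr handler through the helper above.
from logging import StreamHandler

configure_logger('twisted', 'INFO', StreamHandler())
```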
- # Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
+ # Apify loggers have to be set up here and in the `new_configure_logging` as well
+ # to be able to use them both from
# the `main.py` and Scrapy components.
for logger_name in MAIN_LOGGER_NAMES:
    configure_logger(logger_name, LOGGING_LEVEL, apify_handler)

- # We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
- # call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
- # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
- # like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
- # otherwise we would lose some log messages.
+ # We can't attach our log handler to the loggers normally, because Scrapy would remove
+ # them in the `configure_logging` call here:
+ # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
+ # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's
+ # `configure_logging` method like this, so that our handler is attached right after
+ # Scrapy calls the `configure_logging` method, because otherwise we would lose some log
+ # messages.
old_configure_logging = scrapy_logging.configure_logging


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-     """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+     """Configure logging for Scrapy and root loggers to ensure consistent log behavior.

-     We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
-     logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
-     loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
+     We need to manually configure both the root logger and all Scrapy-associated loggers.
+     Configuring only the root logger is not sufficient, as Scrapy will override it with
+     its own settings. Scrapy uses these four primary loggers:
+     https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77.
+     Therefore, we configure here
    these four loggers and the root logger.
    """
    old_configure_logging(*args, **kwargs)

-     # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
-     # property within spiders. See details in the Spider logger property:
+     # We modify the root (None) logger to ensure proper display of logs from spiders when
+     # using the `self.logger` property within spiders. See details in the Spider logger
+     # property:
    # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
    configure_logger(None, LOGGING_LEVEL, apify_handler)

-     # We modify other loggers only by setting up their log level. A custom log handler is added
-     # only to the root logger to avoid duplicate log messages.
+     # We modify other loggers only by setting up their log level. A custom log handler
+     # is added only to the root logger to avoid duplicate log messages.
    for logger_name in ALL_LOGGER_NAMES:
        configure_logger(logger_name, LOGGING_LEVEL)

-     # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
-     # messages, especially when running on the platform.
+     # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose
+     # and spams the logs with useless messages, especially when running on the platform.
    configure_logger('httpx', 'WARNING')

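The hunks above show the saved original and the wrapper, but the line that actually swaps the function sits in a collapsed part of the diff. A condensed sketch of the complete monkeypatching pattern; the final rebinding is an assumption about the elided lines:

```python
# Condensed sketch of the monkeypatch pattern used in this file.
old_configure_logging = scrapy_logging.configure_logging  # keep a reference

def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    old_configure_logging(*args, **kwargs)  # let Scrapy finish its own setup
    configure_logger(None, LOGGING_LEVEL, apify_handler)  # then re-attach our handler

# Presumably done in the elided region: rebind the module attribute so that
# Scrapy's internal calls now go through the wrapper.
scrapy_logging.configure_logging = new_configure_logging
```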
@@ -105,12 +114,11 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
from .main import main

- # For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
- # necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
- # to work together.
- #
- # Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
- # on Windows.
+ # For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify)
+ # asynchronous libraries, it is necessary to set the Twisted reactor to
+ # `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+ # to work together. Note: The reactor must be installed before `nest_asyncio.apply()`
+ # is called, otherwise it will not work correctly on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()
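To illustrate what these two calls achieve (a sketch, not part of the template): with `AsyncioSelectorReactor` installed, `twisted.internet.reactor` resolves to a reactor driven by asyncio's event loop, so Twisted and asyncio code can share it, and `nest_asyncio` additionally allows re-entering a loop that is already running:

```python
# Illustrative check: after install_reactor(), the global Twisted reactor
# is the asyncio-backed implementation.
from twisted.internet import reactor

print(type(reactor).__name__)  # -> 'AsyncioSelectorReactor'
```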