
Commit 4fb959e

Set line length for docs-related code to 90
Update existing examples to be compliant.
1 parent 42ae0b8 commit 4fb959e
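
This commit caps docs-related example code at 90 characters per line and rewraps the existing examples to comply. As a rough sketch only (not part of this commit; the script, the `docs` root path, and the `find_long_lines` helper are assumptions for illustration), compliance could be checked with a few lines of Python:

# Hypothetical checker, not included in this commit: report any line in the docs
# code examples that exceeds the 90-character limit named in the commit message.
from pathlib import Path

MAX_LINE_LENGTH = 90


def find_long_lines(root: str = 'docs') -> list[tuple[Path, int, int]]:
    """Return (path, line number, length) for each line longer than the limit."""
    offenders = []
    for path in Path(root).rglob('*.py'):
        for lineno, line in enumerate(path.read_text().splitlines(), start=1):
            if len(line) > MAX_LINE_LENGTH:
                offenders.append((path, lineno, len(line)))
    return offenders


if __name__ == '__main__':
    for path, lineno, length in find_long_lines():
        print(f'{path}:{lineno}: {length} characters (limit is {MAX_LINE_LENGTH})')

In practice a linter rule (for example Ruff's line-length setting; the project already uses Ruff, as the `# ruff: noqa` directives below suggest) would normally enforce this automatically; the script above is just a stand-in to make the constraint concrete.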

File tree

13 files changed: +113 -61 lines changed


docs/01_overview/code/01_introduction.py

Lines changed: 4 additions & 1 deletion
@@ -10,5 +10,8 @@ async def main() -> None:
         async with httpx.AsyncClient() as client:
             response = await client.get(actor_input['url'])
         soup = BeautifulSoup(response.content, 'html.parser')
-        data = {'url': actor_input['url'], 'title': soup.title.string if soup.title else None}
+        data = {
+            'url': actor_input['url'],
+            'title': soup.title.string if soup.title else None,
+        }
         await Actor.push_data(data)

docs/02_guides/code/02_crawlee_beautifulsoup.py

Lines changed: 2 additions & 1 deletion
@@ -25,7 +25,8 @@ async def main() -> None:

         # Create a crawler.
         crawler = BeautifulSoupCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            # Limit the crawl to max requests.
+            # Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
         )


docs/02_guides/code/02_crawlee_playwright.py

Lines changed: 14 additions & 4 deletions
@@ -25,7 +25,8 @@ async def main() -> None:

         # Create a crawler.
         crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            # Limit the crawl to max requests.
+            # Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
             headless=True,
             browser_launch_options={
@@ -43,9 +44,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
             data = {
                 'url': context.request.url,
                 'title': await context.page.title(),
-                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+                'h1s': [
+                    await h1.text_content()
+                    for h1 in await context.page.locator('h1').all()
+                ],
+                'h2s': [
+                    await h2.text_content()
+                    for h2 in await context.page.locator('h2').all()
+                ],
+                'h3s': [
+                    await h3.text_content()
+                    for h3 in await context.page.locator('h3').all()
+                ],
             }

             # Store the extracted data to the default dataset.

docs/02_guides/code/scrapy_src/__main__.py

Lines changed: 41 additions & 33 deletions
@@ -1,19 +1,20 @@
 """Apify Actor integration for Scrapy projects.

-This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
-logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
+This module transforms a Scrapy project into an Apify Actor, handling the configuration
+of logging, patching Scrapy's logging system, and establishing the required environment
+to run the Scrapy spider within the Apify platform.

-This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
-or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
-`scrapy crawl title_spider`.
+This file is specifically designed to be executed when the project is run as an Apify
+Actor using `apify run` locally or being run on the Apify platform. It is not being
+executed when running the project as a Scrapy project using `scrapy crawl title_spider`.

 We recommend you do not modify this file unless you really know what you are doing.
 """

 # ruff: noqa: E402

-# We need to configure the logging first before we import anything else, so that nothing else imports
-# `scrapy.utils.log` before we patch it.
+# We need to configure the logging first before we import anything else, so that nothing
+# else imports `scrapy.utils.log` before we patch it.
 from __future__ import annotations

 from logging import StreamHandler, getLogger
@@ -29,9 +30,10 @@
 OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
 ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES

-# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
-# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
-# a specific logger, do it in this file.
+# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the
+# field is not present in the file, Scrapy will default to `DEBUG`. This setting applies
+# to all loggers. If you wish to change the logging level for a specific logger,
+# do it in this file.
 settings = get_project_settings()
 LOGGING_LEVEL = settings['LOG_LEVEL']

@@ -40,7 +42,9 @@
 apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))


-def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
+def configure_logger(
+    logger_name: str | None, log_level: str, *handlers: StreamHandler
+) -> None:
     """Configure a logger with the specified settings.

     Args:
@@ -56,41 +60,46 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
         logger.addHandler(handler)


-# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
+# Apify loggers have to be set up here and in the `new_configure_logging` as well to be
+# able to use them both from
 # the `main.py` and Scrapy components.
 for logger_name in MAIN_LOGGER_NAMES:
     configure_logger(logger_name, LOGGING_LEVEL, apify_handler)

-# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
-# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
-# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
-# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
-# otherwise we would lose some log messages.
+# We can't attach our log handler to the loggers normally, because Scrapy would remove
+# them in the `configure_logging` call here:
+# https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
+# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's
+# `configure_logging` method like this, so that our handler is attached right after
+# Scrapy calls the `configure_logging` method, because otherwise we would lose some log
+# messages.
 old_configure_logging = scrapy_logging.configure_logging


 def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-    """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+    """Configure logging for Scrapy and root loggers to ensure consistent log behavior.

-    We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
-    logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
-    loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
+    We need to manually configure both the root logger and all Scrapy-associated loggers.
+    Configuring only the root logger is not sufficient, as Scrapy will override it with
+    its own settings. Scrapy uses these four primary loggers:
+    https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77.
+    Therefore, we configure here
     these four loggers and the root logger.
     """
     old_configure_logging(*args, **kwargs)

-    # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
-    # property within spiders. See details in the Spider logger property:
+    # We modify the root (None) logger to ensure proper display of logs from spiders when
+    # using the `self.logger` property within spiders. See details in the Spider logger
+    # property:
     # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
     configure_logger(None, LOGGING_LEVEL, apify_handler)

-    # We modify other loggers only by setting up their log level. A custom log handler is added
-    # only to the root logger to avoid duplicate log messages.
+    # We modify other loggers only by setting up their log level. A custom log handler
+    # is added only to the root logger to avoid duplicate log messages.
     for logger_name in ALL_LOGGER_NAMES:
         configure_logger(logger_name, LOGGING_LEVEL)

-    # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
-    # messages, especially when running on the platform.
+    # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose
+    # and spams the logs with useless messages, especially when running on the platform.
    configure_logger('httpx', 'WARNING')

@@ -105,12 +114,11 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:

 from .main import main

-# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
-# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
-# to work together.
-#
-# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
-# on Windows.
+# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify)
+# asynchronous libraries, it is necessary to set the Twisted reactor to
+# `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+# to work together. Note: The reactor must be installed before applying
+# `nest_asyncio.apply()`; otherwise, it will not work correctly on Windows.
 install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
 nest_asyncio.apply()

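The comments in this file describe monkeypatching Scrapy's `configure_logging`; the line that actually installs the patch falls outside the changed hunks and is not shown in this diff. For orientation only (assumed from the surrounding code, not part of this commit), it would look like:

# Assumed, not shown in this diff: swap in the patched function so the Apify log
# handler is re-attached every time Scrapy configures logging.
scrapy_logging.configure_logging = new_configure_logging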

docs/02_guides/code/scrapy_src/items.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@
 This module defines Scrapy item models for scraped data. Items represent structured data
 extracted by spiders.

-For detailed information on creating and utilizing items, refer to the official documentation:
+For detailed information on creating and utilizing items,
+refer to the official documentation:
 https://docs.scrapy.org/en/latest/topics/items.html
 """


docs/02_guides/code/scrapy_src/main.py

Lines changed: 12 additions & 9 deletions
@@ -1,22 +1,25 @@
 """This module defines the main entry point for the Apify Actor.

-This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
-processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by
-applying Apify-related settings. Which includes adding a custom scheduler, retry middleware, and an item pipeline
-for pushing data to the Apify dataset.
+This module defines the main coroutine for the Apify Scrapy Actor, executed from
+the __main__.py file. The coroutine processes the Actor's input and executes the Scrapy
+spider. Additionally, it updates Scrapy project settings by applying Apify-related
+settings. This includes adding a custom scheduler, retry middleware, and an item
+pipeline for pushing data to the Apify dataset.

 Customization:
 --------------

-Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
-components like spiders and handling Actor input. However, make sure you have a clear understanding of your
-modifications. For instance, removing `apply_apify_settings` break the integration between Scrapy and Apify.
+Feel free to customize this file to add specific functionality to the Actor, such
+as incorporating your own Scrapy components like spiders and handling Actor input.
+However, make sure you have a clear understanding of your modifications. For instance,
+removing `apply_apify_settings` breaks the integration between Scrapy and Apify.

 Documentation:
 --------------

-For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
-other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
+For an in-depth description of the Apify-Scrapy integration process, our Scrapy
+components, known limitations and other stuff, please refer to the following
+documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
 """

 from __future__ import annotations

docs/02_guides/code/scrapy_src/settings.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 """Scrapy settings module.

-This module contains Scrapy settings for the project, defining various configurations and options.
+This module contains Scrapy settings for the project, defining various configurations
+and options.

 For more comprehensive details on Scrapy settings, refer to the official documentation:
 http://doc.scrapy.org/en/latest/topics/settings.html

docs/02_guides/code/scrapy_src/spiders/title.py

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ class TitleSpider(Spider):

     name = 'title_spider'

-    # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
+    # The `start_urls` specified in this class will be merged with the
+    # `start_urls` value from your Actor input
     # when the project is executed using Apify.
     start_urls = ['https://apify.com/']


docs/03_concepts/code/03_rq.py

Lines changed: 7 additions & 4 deletions
@@ -19,7 +19,9 @@ async def main() -> None:
         await queue.add_request(Request.from_url('http://example.com/0'), forefront=True)

         # If you try to add an existing request again, it will not do anything
-        add_request_info = await queue.add_request(Request.from_url('http://different-example.com/5'))
+        add_request_info = await queue.add_request(
+            Request.from_url('http://different-example.com/5')
+        )
         Actor.log.info(f'Add request info: {add_request_info}')

         processed_request = await queue.get_request(add_request_info.id)
@@ -29,8 +31,8 @@ async def main() -> None:
         while not await queue.is_finished():
             # Fetch the next unhandled request in the queue
             request = await queue.fetch_next_request()
-            # This can happen due to the eventual consistency of the underlying request queue storage,
-            # best solution is just to sleep a bit
+            # This can happen due to the eventual consistency of the underlying request
+            # queue storage, best solution is just to sleep a bit.
             if request is None:
                 await asyncio.sleep(1)
                 continue
@@ -45,6 +47,7 @@ async def main() -> None:
                 Actor.log.info('Request successful.')
                 await queue.mark_request_as_handled(request)
             else:
-                # If processing the request was unsuccessful, reclaim it so it can be processed again
+                # If processing the request was unsuccessful, reclaim it so it can be
+                # processed again.
                 Actor.log.warning('Request failed, will retry!')
                 await queue.reclaim_request(request)

docs/03_concepts/code/05_proxy_actor_input.py

Lines changed: 3 additions & 1 deletion
@@ -5,7 +5,9 @@ async def main() -> None:
     async with Actor:
         actor_input = await Actor.get_input() or {}
         proxy_settings = actor_input.get('proxySettings')
-        proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings)
+        proxy_configuration = await Actor.create_proxy_configuration(
+            actor_proxy_input=proxy_settings
+        )

         if not proxy_configuration:
             raise RuntimeError('No proxy configuration available.')
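
The hunk above only rewraps the `create_proxy_configuration` call; the rest of the example is unchanged and not shown here. For context (a hedged sketch, not part of this diff), a typical continuation inside the same `async with Actor:` block would request a proxy URL from the configuration:

        # Illustrative continuation, not part of this commit: obtain a proxy URL
        # from the configuration created above.
        proxy_url = await proxy_configuration.new_url()
        Actor.log.info(f'Using proxy URL: {proxy_url}')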
