Merged
5 changes: 4 additions & 1 deletion docs/01_overview/code/01_introduction.py
@@ -10,5 +10,8 @@ async def main() -> None:
async with httpx.AsyncClient() as client:
response = await client.get(actor_input['url'])
soup = BeautifulSoup(response.content, 'html.parser')
data = {'url': actor_input['url'], 'title': soup.title.string if soup.title else None}
data = {
'url': actor_input['url'],
'title': soup.title.string if soup.title else None,
}
await Actor.push_data(data)
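For orientation (not part of this diff): the hunk above sits inside the standard Actor entry point. A minimal sketch of the full example, assuming the usual imports and `Actor.get_input()` boilerplate from the surrounding file, could look like this:

import httpx
from apify import Actor
from bs4 import BeautifulSoup


async def main() -> None:
    async with Actor:
        # Read the Actor input; the default URL here is only an illustrative assumption.
        actor_input = await Actor.get_input() or {'url': 'https://apify.com'}
        async with httpx.AsyncClient() as client:
            response = await client.get(actor_input['url'])
        soup = BeautifulSoup(response.content, 'html.parser')
        data = {
            'url': actor_input['url'],
            'title': soup.title.string if soup.title else None,
        }
        await Actor.push_data(data)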
3 changes: 2 additions & 1 deletion docs/02_guides/code/02_crawlee_beautifulsoup.py
@@ -25,7 +25,8 @@ async def main() -> None:

# Create a crawler.
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
# Limit the crawl to max requests.
# Remove or increase it for crawling all links.
max_requests_per_crawl=50,
)

18 changes: 14 additions & 4 deletions docs/02_guides/code/02_crawlee_playwright.py
@@ -25,7 +25,8 @@ async def main() -> None:

# Create a crawler.
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
# Limit the crawl to max requests.
# Remove or increase it for crawling all links.
max_requests_per_crawl=50,
headless=True,
browser_launch_options={
@@ -43,9 +44,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
data = {
'url': context.request.url,
'title': await context.page.title(),
'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
'h1s': [
await h1.text_content()
for h1 in await context.page.locator('h1').all()
],
'h2s': [
await h2.text_content()
for h2 in await context.page.locator('h2').all()
],
'h3s': [
await h3.text_content()
for h3 in await context.page.locator('h3').all()
],
}

# Store the extracted data to the default dataset.
3 changes: 2 additions & 1 deletion docs/02_guides/code/scrapy_project/src/__main__.py
@@ -2,7 +2,8 @@

from twisted.internet import asyncioreactor

# Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
# Install Twisted's asyncio reactor before importing any other Twisted or
# Scrapy components.
asyncioreactor.install() # type: ignore[no-untyped-call]

import os
3 changes: 2 additions & 1 deletion docs/02_guides/code/scrapy_project/src/spiders/title.py
@@ -60,7 +60,8 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
title = response.css('title::text').extract_first()
yield TitleItem(url=url, title=title)

# Extract all links from the page, create `Request` objects out of them, and yield them.
# Extract all links from the page, create `Request` objects out of them,
# and yield them.
for link_href in response.css('a::attr("href")'):
link_url = urljoin(response.url, link_href.get())
if link_url.startswith(('http://', 'https://')):
11 changes: 7 additions & 4 deletions docs/03_concepts/code/03_rq.py
@@ -19,7 +19,9 @@ async def main() -> None:
await queue.add_request(Request.from_url('http://example.com/0'), forefront=True)

# If you try to add an existing request again, it will not do anything
add_request_info = await queue.add_request(Request.from_url('http://different-example.com/5'))
add_request_info = await queue.add_request(
Request.from_url('http://different-example.com/5')
)
Actor.log.info(f'Add request info: {add_request_info}')

processed_request = await queue.get_request(add_request_info.id)
@@ -29,8 +31,8 @@ async def main() -> None:
while not await queue.is_finished():
# Fetch the next unhandled request in the queue
request = await queue.fetch_next_request()
# This can happen due to the eventual consistency of the underlying request queue storage,
# best solution is just to sleep a bit
# This can happen due to the eventual consistency of the underlying request
# queue storage; the best solution is just to sleep a bit.
if request is None:
await asyncio.sleep(1)
continue
@@ -45,6 +47,7 @@ async def main() -> None:
Actor.log.info('Request successful.')
await queue.mark_request_as_handled(request)
else:
# If processing the request was unsuccessful, reclaim it so it can be processed again
# If processing the request was unsuccessful, reclaim it so it can be
# processed again.
Actor.log.warning('Request failed, will retry!')
await queue.reclaim_request(request)
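For orientation (an assumption, not shown in this diff): the queue used in the hunks above is typically opened from the Actor's default request queue storage before any requests are added, for example:

    # Sketch: open the Actor's default request queue; the surrounding file is assumed
    # to do something equivalent before the hunks shown above.
    queue = await Actor.open_request_queue()
    # Enqueue a few URLs with the same add_request call used in the diff above.
    for i in range(3):
        await queue.add_request(Request.from_url(f'http://example.com/{i}'))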
4 changes: 3 additions & 1 deletion docs/03_concepts/code/05_proxy_actor_input.py
@@ -5,7 +5,9 @@ async def main() -> None:
async with Actor:
actor_input = await Actor.get_input() or {}
proxy_settings = actor_input.get('proxySettings')
proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings)
proxy_configuration = await Actor.create_proxy_configuration(
actor_proxy_input=proxy_settings
)

if not proxy_configuration:
raise RuntimeError('No proxy configuration available.')
16 changes: 12 additions & 4 deletions docs/03_concepts/code/05_proxy_rotation.py
@@ -17,7 +17,15 @@ async def main() -> None:
proxy_url = await proxy_configuration.new_url() # http://proxy-2.com
proxy_url = await proxy_configuration.new_url() # http://proxy-1.com
proxy_url = await proxy_configuration.new_url() # http://proxy-2.com
proxy_url = await proxy_configuration.new_url(session_id='a') # http://proxy-1.com
proxy_url = await proxy_configuration.new_url(session_id='b') # http://proxy-2.com
proxy_url = await proxy_configuration.new_url(session_id='b') # http://proxy-2.com
proxy_url = await proxy_configuration.new_url(session_id='a') # http://proxy-1.com
proxy_url = await proxy_configuration.new_url(
session_id='a'
) # http://proxy-1.com
proxy_url = await proxy_configuration.new_url(
session_id='b'
) # http://proxy-2.com
proxy_url = await proxy_configuration.new_url(
session_id='b'
) # http://proxy-2.com
proxy_url = await proxy_configuration.new_url(
session_id='a'
) # http://proxy-1.com
4 changes: 3 additions & 1 deletion docs/03_concepts/code/09_webserver.py
@@ -21,7 +21,9 @@ def run_server() -> None:
# Start the HTTP server on the provided port,
# and save a reference to the server.
global http_server
with ThreadingHTTPServer(('', Actor.config.web_server_port), RequestHandler) as server:
with ThreadingHTTPServer(
('', Actor.config.web_server_port), RequestHandler
) as server:
Actor.log.info(f'Server running on {Actor.config.web_server_port}')
http_server = server
server.serve_forever()
9 changes: 9 additions & 0 deletions docs/pyproject.toml
@@ -0,0 +1,9 @@
# Line length differs from the rest of the codebase so that the example code displayed on the generated
# documentation webpages is shown without a horizontal scrollbar, making it more readable.

[tool.ruff]
# Inherit everything from the project's top-level configuration file.
extend = "../pyproject.toml"

# Override just line length
line-length = 90 # Maximum length that fits on the docs webpage. Longer lines need a scrollbar.