diff --git a/docs/01_overview/code/01_introduction.py b/docs/01_overview/code/01_introduction.py
index c5441d61..a3eaba25 100644
--- a/docs/01_overview/code/01_introduction.py
+++ b/docs/01_overview/code/01_introduction.py
@@ -10,5 +10,8 @@ async def main() -> None:
         async with httpx.AsyncClient() as client:
             response = await client.get(actor_input['url'])
             soup = BeautifulSoup(response.content, 'html.parser')
-            data = {'url': actor_input['url'], 'title': soup.title.string if soup.title else None}
+            data = {
+                'url': actor_input['url'],
+                'title': soup.title.string if soup.title else None,
+            }
             await Actor.push_data(data)
diff --git a/docs/02_guides/code/02_crawlee_beautifulsoup.py b/docs/02_guides/code/02_crawlee_beautifulsoup.py
index 489d83ae..e2dba8a1 100644
--- a/docs/02_guides/code/02_crawlee_beautifulsoup.py
+++ b/docs/02_guides/code/02_crawlee_beautifulsoup.py
@@ -25,7 +25,8 @@ async def main() -> None:
 
         # Create a crawler.
         crawler = BeautifulSoupCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            # Limit the crawl to max requests.
+            # Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
         )
 
diff --git a/docs/02_guides/code/02_crawlee_playwright.py b/docs/02_guides/code/02_crawlee_playwright.py
index 674c1e94..2f0f110f 100644
--- a/docs/02_guides/code/02_crawlee_playwright.py
+++ b/docs/02_guides/code/02_crawlee_playwright.py
@@ -25,7 +25,8 @@ async def main() -> None:
 
         # Create a crawler.
         crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            # Limit the crawl to max requests.
+            # Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
             headless=True,
             browser_launch_options={
@@ -43,9 +44,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         data = {
             'url': context.request.url,
             'title': await context.page.title(),
-            'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-            'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-            'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+            'h1s': [
+                await h1.text_content()
+                for h1 in await context.page.locator('h1').all()
+            ],
+            'h2s': [
+                await h2.text_content()
+                for h2 in await context.page.locator('h2').all()
+            ],
+            'h3s': [
+                await h3.text_content()
+                for h3 in await context.page.locator('h3').all()
+            ],
         }
 
         # Store the extracted data to the default dataset.
diff --git a/docs/02_guides/code/scrapy_project/src/__main__.py b/docs/02_guides/code/scrapy_project/src/__main__.py
index 3dcbf75c..97298fe9 100644
--- a/docs/02_guides/code/scrapy_project/src/__main__.py
+++ b/docs/02_guides/code/scrapy_project/src/__main__.py
@@ -2,7 +2,8 @@
 
 from twisted.internet import asyncioreactor
 
-# Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
+# Install Twisted's asyncio reactor before importing any other Twisted or
+# Scrapy components.
 asyncioreactor.install()  # type: ignore[no-untyped-call]
 
 import os
diff --git a/docs/02_guides/code/scrapy_project/src/spiders/title.py b/docs/02_guides/code/scrapy_project/src/spiders/title.py
index ed54b3c3..9bb25b34 100644
--- a/docs/02_guides/code/scrapy_project/src/spiders/title.py
+++ b/docs/02_guides/code/scrapy_project/src/spiders/title.py
@@ -60,7 +60,8 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None
         title = response.css('title::text').extract_first()
         yield TitleItem(url=url, title=title)
 
-        # Extract all links from the page, create `Request` objects out of them, and yield them.
+        # Extract all links from the page, create `Request` objects out of them,
+        # and yield them.
         for link_href in response.css('a::attr("href")'):
             link_url = urljoin(response.url, link_href.get())
             if link_url.startswith(('http://', 'https://')):
diff --git a/docs/03_concepts/code/03_rq.py b/docs/03_concepts/code/03_rq.py
index ba6a9570..fe1ea605 100644
--- a/docs/03_concepts/code/03_rq.py
+++ b/docs/03_concepts/code/03_rq.py
@@ -19,7 +19,9 @@ async def main() -> None:
         await queue.add_request(Request.from_url('http://example.com/0'), forefront=True)
 
         # If you try to add an existing request again, it will not do anything
-        add_request_info = await queue.add_request(Request.from_url('http://different-example.com/5'))
+        add_request_info = await queue.add_request(
+            Request.from_url('http://different-example.com/5')
+        )
         Actor.log.info(f'Add request info: {add_request_info}')
 
         processed_request = await queue.get_request(add_request_info.id)
@@ -29,8 +31,8 @@
         while not await queue.is_finished():
             # Fetch the next unhandled request in the queue
             request = await queue.fetch_next_request()
-            # This can happen due to the eventual consistency of the underlying request queue storage,
-            # best solution is just to sleep a bit
+            # This can happen due to the eventual consistency of the underlying request
+            # queue storage; the best solution is just to sleep a bit.
             if request is None:
                 await asyncio.sleep(1)
                 continue
@@ -45,6 +47,7 @@
                 Actor.log.info('Request successful.')
                 await queue.mark_request_as_handled(request)
             else:
-                # If processing the request was unsuccessful, reclaim it so it can be processed again
+                # If processing the request was unsuccessful, reclaim it so it can be
+                # processed again.
                 Actor.log.warning('Request failed, will retry!')
                 await queue.reclaim_request(request)
diff --git a/docs/03_concepts/code/05_proxy_actor_input.py b/docs/03_concepts/code/05_proxy_actor_input.py
index 3a69ea0a..3ca0344d 100644
--- a/docs/03_concepts/code/05_proxy_actor_input.py
+++ b/docs/03_concepts/code/05_proxy_actor_input.py
@@ -5,7 +5,9 @@ async def main() -> None:
     async with Actor:
         actor_input = await Actor.get_input() or {}
         proxy_settings = actor_input.get('proxySettings')
-        proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings)
+        proxy_configuration = await Actor.create_proxy_configuration(
+            actor_proxy_input=proxy_settings
+        )
 
         if not proxy_configuration:
             raise RuntimeError('No proxy configuration available.')
diff --git a/docs/03_concepts/code/05_proxy_rotation.py b/docs/03_concepts/code/05_proxy_rotation.py
index c816dabf..8e6a5de0 100644
--- a/docs/03_concepts/code/05_proxy_rotation.py
+++ b/docs/03_concepts/code/05_proxy_rotation.py
@@ -17,7 +17,15 @@ async def main() -> None:
         proxy_url = await proxy_configuration.new_url()  # http://proxy-2.com
         proxy_url = await proxy_configuration.new_url()  # http://proxy-1.com
         proxy_url = await proxy_configuration.new_url()  # http://proxy-2.com
-        proxy_url = await proxy_configuration.new_url(session_id='a')  # http://proxy-1.com
-        proxy_url = await proxy_configuration.new_url(session_id='b')  # http://proxy-2.com
-        proxy_url = await proxy_configuration.new_url(session_id='b')  # http://proxy-2.com
-        proxy_url = await proxy_configuration.new_url(session_id='a')  # http://proxy-1.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='a'
+        )  # http://proxy-1.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='b'
+        )  # http://proxy-2.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='b'
+        )  # http://proxy-2.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='a'
+        )  # http://proxy-1.com
diff --git a/docs/03_concepts/code/09_webserver.py b/docs/03_concepts/code/09_webserver.py
index de6d953d..48a5c10d 100644
--- a/docs/03_concepts/code/09_webserver.py
+++ b/docs/03_concepts/code/09_webserver.py
@@ -21,7 +21,9 @@ def run_server() -> None:
     # Start the HTTP server on the provided port,
     # and save a reference to the server.
     global http_server
-    with ThreadingHTTPServer(('', Actor.config.web_server_port), RequestHandler) as server:
+    with ThreadingHTTPServer(
+        ('', Actor.config.web_server_port), RequestHandler
+    ) as server:
         Actor.log.info(f'Server running on {Actor.config.web_server_port}')
         http_server = server
         server.serve_forever()
diff --git a/docs/pyproject.toml b/docs/pyproject.toml
new file mode 100644
index 00000000..73a75678
--- /dev/null
+++ b/docs/pyproject.toml
@@ -0,0 +1,9 @@
+# The line length differs from the rest of the code to make sure that the example code shown on the generated
+# documentation webpages fits without a horizontal scrollbar, which makes it more readable.
+
+[tool.ruff]
+# Inherit everything from the project's top-level configuration file.
+extend = "../pyproject.toml"
+
+# Override just the line length.
+line-length = 90  # Maximum that fits the doc webpage; longer lines need a scrollbar.