
Commit 704b76d

address the feedback
1 parent 4342563 commit 704b76d

8 files changed, +125 -110 lines


docs/03_guides/05_crawlee.mdx

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ The [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) works
 
 ## Actor with PlaywrightCrawler
 
-The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor.
+The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content rendering. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor.
 
 <CodeBlock className="language-python">
     {CrawleePlaywrightExample}

docs/03_guides/code/01_beautifulsoup_httpx.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,4 @@
+import asyncio
 from urllib.parse import urljoin
 
 import httpx
@@ -81,3 +82,7 @@ async def main() -> None:
             finally:
                 # Mark the request as handled to ensure it is not processed again.
                 await request_queue.mark_request_as_handled(new_request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/03_guides/code/02_parsel_impit.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,4 @@
+import asyncio
 from urllib.parse import urljoin
 
 import impit
@@ -87,3 +88,7 @@ async def main() -> None:
             finally:
                 # Mark the request as handled to ensure it is not processed again.
                 await request_queue.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/03_guides/code/03_playwright.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,4 @@
+import asyncio
 from urllib.parse import urljoin
 
 from playwright.async_api import async_playwright
@@ -90,3 +91,7 @@ async def main() -> None:
                 await page.close()
                 # Mark the request as handled to ensure it is not processed again.
                 await request_queue.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/03_guides/code/04_selenium.py

Lines changed: 4 additions & 0 deletions
@@ -100,3 +100,7 @@ async def main() -> None:
                 await request_queue.mark_request_as_handled(request)
 
         driver.quit()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
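All four guide scripts gain the same module entrypoint, which also lets them be run directly with `python`. A minimal sketch of the pattern these hunks add; the crawling logic inside `main()` is elided, and the `async with Actor` block is assumed from the guides' standard layout:

import asyncio

from apify import Actor


async def main() -> None:
    # Enter the Actor context; the guide's existing crawling logic goes here.
    async with Actor:
        ...


if __name__ == '__main__':
    # Run the async entrypoint when the script is executed directly.
    asyncio.run(main())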
Lines changed: 34 additions & 32 deletions
@@ -1,7 +1,36 @@
+import asyncio
+
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 from apify import Actor
 
+# Create a crawler.
+crawler = BeautifulSoupCrawler(
+    # Limit the crawl to max requests. Remove or increase it for crawling all links.
+    max_requests_per_crawl=50,
+)
+
+
+# Define a request handler, which will be called for every request.
+@crawler.router.default_handler
+async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+    Actor.log.info(f'Scraping {context.request.url}...')
+
+    # Extract the desired data.
+    data = {
+        'url': context.request.url,
+        'title': context.soup.title.string if context.soup.title else None,
+        'h1s': [h1.text for h1 in context.soup.find_all('h1')],
+        'h2s': [h2.text for h2 in context.soup.find_all('h2')],
+        'h3s': [h3.text for h3 in context.soup.find_all('h3')],
+    }
+
+    # Store the extracted data to the default dataset.
+    await context.push_data(data)
+
+    # Enqueue additional links found on the current page.
+    await context.enqueue_links(strategy='same-domain')
+
 
 async def main() -> None:
     # Enter the context of the Actor.
@@ -10,44 +39,17 @@ async def main() -> None:
         actor_input = await Actor.get_input() or {}
         start_urls = [
             url.get('url')
-            for url in actor_input.get(
-                'start_urls',
-                [{'url': 'https://apify.com'}],
-            )
+            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         ]
 
         # Exit if no start URLs are provided.
         if not start_urls:
             Actor.log.info('No start URLs specified in Actor input, exiting...')
             await Actor.exit()
 
-        # Create a crawler.
-        crawler = BeautifulSoupCrawler(
-            # Limit the crawl to max requests.
-            # Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': context.soup.title.string if context.soup.title else None,
-                'h1s': [h1.text for h1 in context.soup.find_all('h1')],
-                'h2s': [h2.text for h2 in context.soup.find_all('h2')],
-                'h3s': [h3.text for h3 in context.soup.find_all('h3')],
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
         # Run the crawler with the starting requests.
         await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
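Pieced together from the two hunks above, the restructured BeautifulSoup example reads roughly as follows. This is a sketch assembled from the diff; the `async with Actor:` line is not visible in the hunks and is assumed from the standard Actor layout.

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

# Create a crawler.
crawler = BeautifulSoupCrawler(
    # Limit the crawl to max requests. Remove or increase it for crawling all links.
    max_requests_per_crawl=50,
)


# Define a request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')

    # Extract the desired data.
    data = {
        'url': context.request.url,
        'title': context.soup.title.string if context.soup.title else None,
        'h1s': [h1.text for h1 in context.soup.find_all('h1')],
        'h2s': [h2.text for h2 in context.soup.find_all('h2')],
        'h3s': [h3.text for h3 in context.soup.find_all('h3')],
    }

    # Store the extracted data to the default dataset.
    await context.push_data(data)

    # Enqueue additional links found on the current page.
    await context.enqueue_links(strategy='same-domain')


async def main() -> None:
    # Enter the context of the Actor.
    async with Actor:  # assumed; this line is not shown in the hunks
        actor_input = await Actor.get_input() or {}
        start_urls = [
            url.get('url')
            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]

        # Exit if no start URLs are provided.
        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Run the crawler with the starting requests.
        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())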
Lines changed: 34 additions & 32 deletions
@@ -1,7 +1,36 @@
+import asyncio
+
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
 
 from apify import Actor
 
+# Create a crawler.
+crawler = ParselCrawler(
+    # Limit the crawl to max requests. Remove or increase it for crawling all links.
+    max_requests_per_crawl=50,
+)
+
+
+# Define a request handler, which will be called for every request.
+@crawler.router.default_handler
+async def request_handler(context: ParselCrawlingContext) -> None:
+    Actor.log.info(f'Scraping {context.request.url}...')
+
+    # Extract the desired data.
+    data = {
+        'url': context.request.url,
+        'title': context.selector.xpath('//title/text()').get(),
+        'h1s': context.selector.xpath('//h1/text()').getall(),
+        'h2s': context.selector.xpath('//h2/text()').getall(),
+        'h3s': context.selector.xpath('//h3/text()').getall(),
+    }
+
+    # Store the extracted data to the default dataset.
+    await context.push_data(data)
+
+    # Enqueue additional links found on the current page.
+    await context.enqueue_links(strategy='same-domain')
+
 
 async def main() -> None:
     # Enter the context of the Actor.
@@ -10,44 +39,17 @@ async def main() -> None:
         actor_input = await Actor.get_input() or {}
         start_urls = [
             url.get('url')
-            for url in actor_input.get(
-                'start_urls',
-                [{'url': 'https://apify.com'}],
-            )
+            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]
 
         # Exit if no start URLs are provided.
         if not start_urls:
             Actor.log.info('No start URLs specified in Actor input, exiting...')
             await Actor.exit()
 
-        # Create a crawler.
-        crawler = ParselCrawler(
-            # Limit the crawl to max requests.
-            # Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: ParselCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': context.selector.xpath('//title/text()').get(),
-                'h1s': context.selector.xpath('//h1/text()').getall(),
-                'h2s': context.selector.xpath('//h2/text()').getall(),
-                'h3s': context.selector.xpath('//h3/text()').getall(),
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
         # Run the crawler with the starting requests.
         await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
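The Parsel example follows the same module-level layout as the BeautifulSoup sketch above; only the crawler class and the extraction change, using XPath selectors on `context.selector`. A condensed sketch of the module-level part, taken from the added lines (the `h2s` and `h3s` fields are omitted for brevity):

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

from apify import Actor

# Create a crawler, limited to 50 requests per crawl.
crawler = ParselCrawler(max_requests_per_crawl=50)


# Define a request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')

    # Extract the desired data with XPath selectors.
    data = {
        'url': context.request.url,
        'title': context.selector.xpath('//title/text()').get(),
        'h1s': context.selector.xpath('//h1/text()').getall(),
    }

    # Store the extracted data and enqueue same-domain links.
    await context.push_data(data)
    await context.enqueue_links(strategy='same-domain')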
Lines changed: 37 additions & 45 deletions
@@ -1,7 +1,39 @@
+import asyncio
+
 from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
 
 from apify import Actor
 
+# Create a crawler.
+crawler = PlaywrightCrawler(
+    # Limit the crawl to max requests. Remove or increase it for crawling all links.
+    max_requests_per_crawl=50,
+    # Run the browser in a headless mode.
+    headless=True,
+    browser_launch_options={'args': ['--disable-gpu']},
+)
+
+
+# Define a request handler, which will be called for every request.
+@crawler.router.default_handler
+async def request_handler(context: PlaywrightCrawlingContext) -> None:
+    Actor.log.info(f'Scraping {context.request.url}...')
+
+    # Extract the desired data.
+    data = {
+        'url': context.request.url,
+        'title': await context.page.title(),
+        'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
+        'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
+        'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+    }
+
+    # Store the extracted data to the default dataset.
+    await context.push_data(data)
+
+    # Enqueue additional links found on the current page.
+    await context.enqueue_links(strategy='same-domain')
+
 
 async def main() -> None:
     # Enter the context of the Actor.
@@ -10,57 +42,17 @@ async def main() -> None:
         actor_input = await Actor.get_input() or {}
         start_urls = [
             url.get('url')
-            for url in actor_input.get(
-                'start_urls',
-                [{'url': 'https://apify.com'}],
-            )
+            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         ]
 
         # Exit if no start URLs are provided.
         if not start_urls:
             Actor.log.info('No start URLs specified in Actor input, exiting...')
             await Actor.exit()
 
-        # Create a crawler.
-        crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests.
-            # Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-            headless=True,
-            browser_launch_options={
-                'args': ['--disable-gpu'],
-            },
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: PlaywrightCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': await context.page.title(),
-                'h1s': [
-                    await h1.text_content()
-                    for h1 in await context.page.locator('h1').all()
-                ],
-                'h2s': [
-                    await h2.text_content()
-                    for h2 in await context.page.locator('h2').all()
-                ],
-                'h3s': [
-                    await h3.text_content()
-                    for h3 in await context.page.locator('h3').all()
-                ],
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
         # Run the crawler with the starting requests.
         await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
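The Playwright example again mirrors the same layout; the differences are the browser-related constructor options and the fact that data is read from the live page through Playwright locators. A condensed sketch of those parts, taken from the added lines (the `h2s` and `h3s` fields are omitted for brevity):

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

from apify import Actor

# Create a crawler that runs a headless browser without GPU acceleration.
crawler = PlaywrightCrawler(
    max_requests_per_crawl=50,
    headless=True,
    browser_launch_options={'args': ['--disable-gpu']},
)


# Define a request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')

    # The page is rendered by the browser, so data comes from Playwright locators.
    data = {
        'url': context.request.url,
        'title': await context.page.title(),
        'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
    }

    # Store the extracted data and enqueue same-domain links.
    await context.push_data(data)
    await context.enqueue_links(strategy='same-domain')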
