diff --git a/docs/03_concepts/01_actor_lifecycle.mdx b/docs/02_concepts/01_actor_lifecycle.mdx similarity index 100% rename from docs/03_concepts/01_actor_lifecycle.mdx rename to docs/02_concepts/01_actor_lifecycle.mdx diff --git a/docs/03_concepts/02_actor_input.mdx b/docs/02_concepts/02_actor_input.mdx similarity index 100% rename from docs/03_concepts/02_actor_input.mdx rename to docs/02_concepts/02_actor_input.mdx diff --git a/docs/03_concepts/03_storages.mdx b/docs/02_concepts/03_storages.mdx similarity index 100% rename from docs/03_concepts/03_storages.mdx rename to docs/02_concepts/03_storages.mdx diff --git a/docs/03_concepts/04_actor_events.mdx b/docs/02_concepts/04_actor_events.mdx similarity index 100% rename from docs/03_concepts/04_actor_events.mdx rename to docs/02_concepts/04_actor_events.mdx diff --git a/docs/03_concepts/05_proxy_management.mdx b/docs/02_concepts/05_proxy_management.mdx similarity index 100% rename from docs/03_concepts/05_proxy_management.mdx rename to docs/02_concepts/05_proxy_management.mdx diff --git a/docs/03_concepts/06_interacting_with_other_actors.mdx b/docs/02_concepts/06_interacting_with_other_actors.mdx similarity index 100% rename from docs/03_concepts/06_interacting_with_other_actors.mdx rename to docs/02_concepts/06_interacting_with_other_actors.mdx diff --git a/docs/03_concepts/07_webhooks.mdx b/docs/02_concepts/07_webhooks.mdx similarity index 100% rename from docs/03_concepts/07_webhooks.mdx rename to docs/02_concepts/07_webhooks.mdx diff --git a/docs/03_concepts/08_access_apify_api.mdx b/docs/02_concepts/08_access_apify_api.mdx similarity index 100% rename from docs/03_concepts/08_access_apify_api.mdx rename to docs/02_concepts/08_access_apify_api.mdx diff --git a/docs/03_concepts/09_running_webserver.mdx b/docs/02_concepts/09_running_webserver.mdx similarity index 100% rename from docs/03_concepts/09_running_webserver.mdx rename to docs/02_concepts/09_running_webserver.mdx diff --git a/docs/03_concepts/10_logging.mdx b/docs/02_concepts/10_logging.mdx similarity index 100% rename from docs/03_concepts/10_logging.mdx rename to docs/02_concepts/10_logging.mdx diff --git a/docs/03_concepts/11_configuration.mdx b/docs/02_concepts/11_configuration.mdx similarity index 100% rename from docs/03_concepts/11_configuration.mdx rename to docs/02_concepts/11_configuration.mdx diff --git a/docs/03_concepts/12_pay_per_event.mdx b/docs/02_concepts/12_pay_per_event.mdx similarity index 100% rename from docs/03_concepts/12_pay_per_event.mdx rename to docs/02_concepts/12_pay_per_event.mdx diff --git a/docs/03_concepts/code/01_context_manager.py b/docs/02_concepts/code/01_context_manager.py similarity index 100% rename from docs/03_concepts/code/01_context_manager.py rename to docs/02_concepts/code/01_context_manager.py diff --git a/docs/03_concepts/code/01_init_exit.py b/docs/02_concepts/code/01_init_exit.py similarity index 100% rename from docs/03_concepts/code/01_init_exit.py rename to docs/02_concepts/code/01_init_exit.py diff --git a/docs/03_concepts/code/01_reboot.py b/docs/02_concepts/code/01_reboot.py similarity index 100% rename from docs/03_concepts/code/01_reboot.py rename to docs/02_concepts/code/01_reboot.py diff --git a/docs/03_concepts/code/01_status_message.py b/docs/02_concepts/code/01_status_message.py similarity index 100% rename from docs/03_concepts/code/01_status_message.py rename to docs/02_concepts/code/01_status_message.py diff --git a/docs/03_concepts/code/02_input.py b/docs/02_concepts/code/02_input.py similarity index 100% 
rename from docs/03_concepts/code/02_input.py rename to docs/02_concepts/code/02_input.py diff --git a/docs/03_concepts/code/03_dataset_exports.py b/docs/02_concepts/code/03_dataset_exports.py similarity index 100% rename from docs/03_concepts/code/03_dataset_exports.py rename to docs/02_concepts/code/03_dataset_exports.py diff --git a/docs/03_concepts/code/03_dataset_read_write.py b/docs/02_concepts/code/03_dataset_read_write.py similarity index 100% rename from docs/03_concepts/code/03_dataset_read_write.py rename to docs/02_concepts/code/03_dataset_read_write.py diff --git a/docs/03_concepts/code/03_deleting_storages.py b/docs/02_concepts/code/03_deleting_storages.py similarity index 100% rename from docs/03_concepts/code/03_deleting_storages.py rename to docs/02_concepts/code/03_deleting_storages.py diff --git a/docs/03_concepts/code/03_kvs_iterating.py b/docs/02_concepts/code/03_kvs_iterating.py similarity index 100% rename from docs/03_concepts/code/03_kvs_iterating.py rename to docs/02_concepts/code/03_kvs_iterating.py diff --git a/docs/03_concepts/code/03_kvs_public_url.py b/docs/02_concepts/code/03_kvs_public_url.py similarity index 100% rename from docs/03_concepts/code/03_kvs_public_url.py rename to docs/02_concepts/code/03_kvs_public_url.py diff --git a/docs/03_concepts/code/03_kvs_read_write.py b/docs/02_concepts/code/03_kvs_read_write.py similarity index 100% rename from docs/03_concepts/code/03_kvs_read_write.py rename to docs/02_concepts/code/03_kvs_read_write.py diff --git a/docs/03_concepts/code/03_opening_storages.py b/docs/02_concepts/code/03_opening_storages.py similarity index 100% rename from docs/03_concepts/code/03_opening_storages.py rename to docs/02_concepts/code/03_opening_storages.py diff --git a/docs/03_concepts/code/03_rq.py b/docs/02_concepts/code/03_rq.py similarity index 100% rename from docs/03_concepts/code/03_rq.py rename to docs/02_concepts/code/03_rq.py diff --git a/docs/03_concepts/code/04_actor_events.py b/docs/02_concepts/code/04_actor_events.py similarity index 100% rename from docs/03_concepts/code/04_actor_events.py rename to docs/02_concepts/code/04_actor_events.py diff --git a/docs/03_concepts/code/05_apify_proxy.py b/docs/02_concepts/code/05_apify_proxy.py similarity index 100% rename from docs/03_concepts/code/05_apify_proxy.py rename to docs/02_concepts/code/05_apify_proxy.py diff --git a/docs/03_concepts/code/05_apify_proxy_config.py b/docs/02_concepts/code/05_apify_proxy_config.py similarity index 100% rename from docs/03_concepts/code/05_apify_proxy_config.py rename to docs/02_concepts/code/05_apify_proxy_config.py diff --git a/docs/03_concepts/code/05_custom_proxy.py b/docs/02_concepts/code/05_custom_proxy.py similarity index 100% rename from docs/03_concepts/code/05_custom_proxy.py rename to docs/02_concepts/code/05_custom_proxy.py diff --git a/docs/03_concepts/code/05_custom_proxy_function.py b/docs/02_concepts/code/05_custom_proxy_function.py similarity index 100% rename from docs/03_concepts/code/05_custom_proxy_function.py rename to docs/02_concepts/code/05_custom_proxy_function.py diff --git a/docs/03_concepts/code/05_proxy_actor_input.py b/docs/02_concepts/code/05_proxy_actor_input.py similarity index 100% rename from docs/03_concepts/code/05_proxy_actor_input.py rename to docs/02_concepts/code/05_proxy_actor_input.py diff --git a/docs/03_concepts/code/05_proxy_httpx.py b/docs/02_concepts/code/05_proxy_httpx.py similarity index 100% rename from docs/03_concepts/code/05_proxy_httpx.py rename to 
docs/02_concepts/code/05_proxy_httpx.py diff --git a/docs/03_concepts/code/05_proxy_rotation.py b/docs/02_concepts/code/05_proxy_rotation.py similarity index 100% rename from docs/03_concepts/code/05_proxy_rotation.py rename to docs/02_concepts/code/05_proxy_rotation.py diff --git a/docs/03_concepts/code/06_interacting_call.py b/docs/02_concepts/code/06_interacting_call.py similarity index 100% rename from docs/03_concepts/code/06_interacting_call.py rename to docs/02_concepts/code/06_interacting_call.py diff --git a/docs/03_concepts/code/06_interacting_call_task.py b/docs/02_concepts/code/06_interacting_call_task.py similarity index 100% rename from docs/03_concepts/code/06_interacting_call_task.py rename to docs/02_concepts/code/06_interacting_call_task.py diff --git a/docs/03_concepts/code/06_interacting_metamorph.py b/docs/02_concepts/code/06_interacting_metamorph.py similarity index 100% rename from docs/03_concepts/code/06_interacting_metamorph.py rename to docs/02_concepts/code/06_interacting_metamorph.py diff --git a/docs/03_concepts/code/06_interacting_start.py b/docs/02_concepts/code/06_interacting_start.py similarity index 100% rename from docs/03_concepts/code/06_interacting_start.py rename to docs/02_concepts/code/06_interacting_start.py diff --git a/docs/03_concepts/code/07_webhook.py b/docs/02_concepts/code/07_webhook.py similarity index 100% rename from docs/03_concepts/code/07_webhook.py rename to docs/02_concepts/code/07_webhook.py diff --git a/docs/03_concepts/code/07_webhook_preventing.py b/docs/02_concepts/code/07_webhook_preventing.py similarity index 100% rename from docs/03_concepts/code/07_webhook_preventing.py rename to docs/02_concepts/code/07_webhook_preventing.py diff --git a/docs/03_concepts/code/08_actor_client.py b/docs/02_concepts/code/08_actor_client.py similarity index 100% rename from docs/03_concepts/code/08_actor_client.py rename to docs/02_concepts/code/08_actor_client.py diff --git a/docs/03_concepts/code/08_actor_new_client.py b/docs/02_concepts/code/08_actor_new_client.py similarity index 100% rename from docs/03_concepts/code/08_actor_new_client.py rename to docs/02_concepts/code/08_actor_new_client.py diff --git a/docs/03_concepts/code/09_webserver.py b/docs/02_concepts/code/09_webserver.py similarity index 100% rename from docs/03_concepts/code/09_webserver.py rename to docs/02_concepts/code/09_webserver.py diff --git a/docs/03_concepts/code/10_log_config.py b/docs/02_concepts/code/10_log_config.py similarity index 100% rename from docs/03_concepts/code/10_log_config.py rename to docs/02_concepts/code/10_log_config.py diff --git a/docs/03_concepts/code/10_logger_usage.py b/docs/02_concepts/code/10_logger_usage.py similarity index 100% rename from docs/03_concepts/code/10_logger_usage.py rename to docs/02_concepts/code/10_logger_usage.py diff --git a/docs/03_concepts/code/10_redirect_log.py b/docs/02_concepts/code/10_redirect_log.py similarity index 100% rename from docs/03_concepts/code/10_redirect_log.py rename to docs/02_concepts/code/10_redirect_log.py diff --git a/docs/03_concepts/code/10_redirect_log_existing_run.py b/docs/02_concepts/code/10_redirect_log_existing_run.py similarity index 100% rename from docs/03_concepts/code/10_redirect_log_existing_run.py rename to docs/02_concepts/code/10_redirect_log_existing_run.py diff --git a/docs/03_concepts/code/11_config.py b/docs/02_concepts/code/11_config.py similarity index 100% rename from docs/03_concepts/code/11_config.py rename to docs/02_concepts/code/11_config.py diff --git 
a/docs/03_concepts/code/actor_charge.py b/docs/02_concepts/code/actor_charge.py similarity index 100% rename from docs/03_concepts/code/actor_charge.py rename to docs/02_concepts/code/actor_charge.py diff --git a/docs/03_concepts/code/conditional_actor_charge.py b/docs/02_concepts/code/conditional_actor_charge.py similarity index 100% rename from docs/03_concepts/code/conditional_actor_charge.py rename to docs/02_concepts/code/conditional_actor_charge.py diff --git a/docs/02_guides/01_beautifulsoup_httpx.mdx b/docs/02_guides/01_beautifulsoup_httpx.mdx deleted file mode 100644 index 4ecabd6e..00000000 --- a/docs/02_guides/01_beautifulsoup_httpx.mdx +++ /dev/null @@ -1,30 +0,0 @@ ---- -id: beautifulsoup-httpx -title: Using BeautifulSoup with HTTPX ---- - -import CodeBlock from '@theme/CodeBlock'; - -import BeautifulSoupHttpxExample from '!!raw-loader!./code/01_beautifulsoup_httpx.py'; - -In this guide, you'll learn how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library with the [HTTPX](https://www.python-httpx.org/) library in your Apify Actors. - -## Introduction - -`BeautifulSoup` is a Python library for extracting data from HTML and XML files. It provides simple methods and Pythonic idioms for navigating, searching, and modifying a website's element tree, enabling efficient data extraction. - -`HTTPX` is a modern, high-level HTTP client library for Python. It provides a simple interface for making HTTP requests and supports both synchronous and asynchronous requests. - -To create an `Actor` which uses those libraries, start from the [BeautifulSoup & Python](https://apify.com/templates/categories/python) Actor template. This template includes the `BeautifulSoup` and `HTTPX` libraries preinstalled, allowing you to begin development immediately. - -## Example Actor - -Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses `HTTPX` for fetching pages and `BeautifulSoup` for parsing their content to extract titles and links to other pages. - - - {BeautifulSoupHttpxExample} - - -## Conclusion - -In this guide, you learned how to use the `BeautifulSoup` with the `HTTPX` in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/02_crawlee.mdx b/docs/02_guides/02_crawlee.mdx deleted file mode 100644 index b040cad2..00000000 --- a/docs/02_guides/02_crawlee.mdx +++ /dev/null @@ -1,37 +0,0 @@ ---- -id: crawlee -title: Using Crawlee ---- - -import CodeBlock from '@theme/CodeBlock'; - -import CrawleeBeautifulSoupExample from '!!raw-loader!./code/02_crawlee_beautifulsoup.py'; -import CrawleePlaywrightExample from '!!raw-loader!./code/02_crawlee_playwright.py'; - -In this guide you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. - -## Introduction - -`Crawlee` is a Python library for web scraping and browser automation that provides a robust and flexible framework for building web scraping tasks. 
It seamlessly integrates with the Apify platform and supports a variety of scraping techniques, from static HTML parsing to dynamic JavaScript-rendered content handling. Crawlee offers a range of crawlers, including HTTP-based crawlers like [`HttpCrawler`](https://crawlee.dev/python/api/class/HttpCrawler), [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) and [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and browser-based crawlers like [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler), to suit different scraping needs. - -In this guide, you'll learn how to use Crawlee with `BeautifulSoupCrawler` and `PlaywrightCrawler` to build Apify Actors for web scraping. - -## Actor with BeautifulSoupCrawler - -The `BeautifulSoupCrawler` is ideal for extracting data from static HTML pages. It uses `BeautifulSoup` for parsing and [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication, ensuring efficient and lightweight scraping. If you do not need to execute JavaScript on the page, `BeautifulSoupCrawler` is a great choice for your scraping tasks. Below is an example of how to use `BeautifulSoupCrawler` in an Apify Actor. - - - {CrawleeBeautifulSoupExample} - - -## Actor with PlaywrightCrawler - -The `PlaywrightCrawler` is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use `PlaywrightCrawler` in an Apify Actor. - - - {CrawleePlaywrightExample} - - -## Conclusion - -In this guide, you learned how to use the `Crawlee` library in your Apify Actors. By using the `BeautifulSoupCrawler` and `PlaywrightCrawler` crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/code/02_crawlee_beautifulsoup.py b/docs/02_guides/code/02_crawlee_beautifulsoup.py deleted file mode 100644 index e2dba8a1..00000000 --- a/docs/02_guides/code/02_crawlee_beautifulsoup.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import annotations - -from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext - -from apify import Actor - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = [ - url.get('url') - for url in actor_input.get( - 'start_urls', - [{'url': 'https://apify.com'}], - ) - ] - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a crawler. - crawler = BeautifulSoupCrawler( - # Limit the crawl to max requests. - # Remove or increase it for crawling all links. - max_requests_per_crawl=50, - ) - - # Define a request handler, which will be called for every request. 
- @crawler.router.default_handler - async def request_handler(context: BeautifulSoupCrawlingContext) -> None: - url = context.request.url - Actor.log.info(f'Scraping {url}...') - - # Extract the desired data. - data = { - 'url': context.request.url, - 'title': context.soup.title.string if context.soup.title else None, - 'h1s': [h1.text for h1 in context.soup.find_all('h1')], - 'h2s': [h2.text for h2 in context.soup.find_all('h2')], - 'h3s': [h3.text for h3 in context.soup.find_all('h3')], - } - - # Store the extracted data to the default dataset. - await context.push_data(data) - - # Enqueue additional links found on the current page. - await context.enqueue_links() - - # Run the crawler with the starting requests. - await crawler.run(start_urls) diff --git a/docs/02_guides/code/02_crawlee_playwright.py b/docs/02_guides/code/02_crawlee_playwright.py deleted file mode 100644 index 2f0f110f..00000000 --- a/docs/02_guides/code/02_crawlee_playwright.py +++ /dev/null @@ -1,68 +0,0 @@ -from __future__ import annotations - -from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext - -from apify import Actor - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = [ - url.get('url') - for url in actor_input.get( - 'start_urls', - [{'url': 'https://apify.com'}], - ) - ] - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a crawler. - crawler = PlaywrightCrawler( - # Limit the crawl to max requests. - # Remove or increase it for crawling all links. - max_requests_per_crawl=50, - headless=True, - browser_launch_options={ - 'args': ['--disable-gpu'], - }, - ) - - # Define a request handler, which will be called for every request. - @crawler.router.default_handler - async def request_handler(context: PlaywrightCrawlingContext) -> None: - url = context.request.url - Actor.log.info(f'Scraping {url}...') - - # Extract the desired data. - data = { - 'url': context.request.url, - 'title': await context.page.title(), - 'h1s': [ - await h1.text_content() - for h1 in await context.page.locator('h1').all() - ], - 'h2s': [ - await h2.text_content() - for h2 in await context.page.locator('h2').all() - ], - 'h3s': [ - await h3.text_content() - for h3 in await context.page.locator('h3').all() - ], - } - - # Store the extracted data to the default dataset. - await context.push_data(data) - - # Enqueue additional links found on the current page. - await context.enqueue_links() - - # Run the crawler with the starting requests. - await crawler.run(start_urls) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx new file mode 100644 index 00000000..b6a69c01 --- /dev/null +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -0,0 +1,30 @@ +--- +id: beautifulsoup-httpx +title: Using BeautifulSoup with HTTPX +--- + +import CodeBlock from '@theme/CodeBlock'; + +import BeautifulSoupHttpxExample from '!!raw-loader!./code/01_beautifulsoup_httpx.py'; + +In this guide, you'll learn how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library with the [HTTPX](https://www.python-httpx.org/) library in your Apify Actors. + +## Introduction + +[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) is a Python library for extracting data from HTML and XML files. 
It provides simple methods and Pythonic idioms for navigating, searching, and modifying a website's element tree, enabling efficient data extraction. + +[HTTPX](https://www.python-httpx.org/) is a modern, high-level HTTP client library for Python. It provides a simple interface for making HTTP requests and supports both synchronous and asynchronous requests. + +To create an Actor that uses these libraries, start from the [BeautifulSoup & Python](https://apify.com/templates/categories/python) Actor template. This template includes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) and [HTTPX](https://www.python-httpx.org/) libraries preinstalled, allowing you to begin development immediately. + +## Example Actor + +Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses [HTTPX](https://www.python-httpx.org/) for fetching pages and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract titles and links to other pages. + + + {BeautifulSoupHttpxExample} + + +## Conclusion + +In this guide, you learned how to use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) together with [HTTPX](https://www.python-httpx.org/) in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx new file mode 100644 index 00000000..2ac4d610 --- /dev/null +++ b/docs/03_guides/02_parsel_impit.mdx @@ -0,0 +1,28 @@ +--- +id: parsel-impit +title: Using Parsel with Impit +--- + +import CodeBlock from '@theme/CodeBlock'; + +import ParselImpitExample from '!!raw-loader!./code/02_parsel_impit.py'; + +In this guide, you'll learn how to combine the [Parsel](https://github.com/scrapy/parsel) and [Impit](https://github.com/apify/impit) libraries when building Apify Actors. + +## Introduction + +[Parsel](https://github.com/scrapy/parsel) is a Python library for extracting data from HTML and XML documents using CSS selectors and [XPath](https://en.wikipedia.org/wiki/XPath) expressions. It offers an intuitive API for navigating and extracting structured data, making it a popular choice for web scraping. Compared to [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), it also delivers better performance. + +[Impit](https://github.com/apify/impit) is Apify's high-performance HTTP client for Python. It supports both synchronous and asynchronous workflows and is built for large-scale web scraping, where making thousands of requests efficiently is essential. With built-in browser impersonation and anti-blocking features, it simplifies handling modern websites. + +## Example Actor + +The following example shows a simple Actor that recursively scrapes titles from linked pages, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages and [Parsel](https://github.com/scrapy/parsel) to extract titles and discover new links.
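For a quick sense of how the two libraries fit together before the full Actor example, here is a minimal sketch that pairs Impit's asynchronous client with a Parsel selector. It is only an illustration: it reuses the calls that appear in the `02_parsel_impit.py` example added below (`impit.AsyncClient`, `client.get`, `parsel.Selector`, and CSS-selector extraction) and assumes both packages are installed.

```python
import asyncio

import impit
import parsel


async def main() -> None:
    # Fetch the page with Impit's asynchronous HTTP client.
    async with impit.AsyncClient() as client:
        response = await client.get('https://apify.com')

    # Parse the HTML with Parsel and extract data using CSS selectors.
    selector = parsel.Selector(text=response.text)
    print('Title:', selector.css('title::text').get())
    print('First links:', selector.css('a::attr(href)').getall()[:5])


if __name__ == '__main__':
    asyncio.run(main())
```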
+ + + {ParselImpitExample} + + +## Conclusion + +In this guide, you learned how to use [Parsel](https://github.com/scrapy/parsel) with [Impit](https://github.com/apify/impit) in your Apify Actors. By combining these libraries, you get a powerful and efficient solution for web scraping: [Parsel](https://github.com/scrapy/parsel) provides excellent CSS selector and XPath support for data extraction, while [Impit](https://github.com/apify/impit) offers a fast and simple HTTP client built by Apify. This combination makes it easy to build scalable web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx similarity index 100% rename from docs/02_guides/03_playwright.mdx rename to docs/03_guides/03_playwright.mdx diff --git a/docs/02_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx similarity index 100% rename from docs/02_guides/04_selenium.mdx rename to docs/03_guides/04_selenium.mdx diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx new file mode 100644 index 00000000..6b513417 --- /dev/null +++ b/docs/03_guides/05_crawlee.mdx @@ -0,0 +1,46 @@ +--- +id: crawlee +title: Using Crawlee +--- + +import CodeBlock from '@theme/CodeBlock'; + +import CrawleeBeautifulSoupExample from '!!raw-loader!./code/05_crawlee_beautifulsoup.py'; +import CrawleeParselExample from '!!raw-loader!./code/05_crawlee_parsel.py'; +import CrawleePlaywrightExample from '!!raw-loader!./code/05_crawlee_playwright.py'; + +In this guide you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. + +## Introduction + +[Crawlee](https://crawlee.dev/python) is a Python library for web scraping and browser automation that provides a robust and flexible framework for building web scraping tasks. It seamlessly integrates with the Apify platform and supports a variety of scraping techniques, from static HTML parsing to dynamic JavaScript-rendered content handling. Crawlee offers a range of crawlers, including HTTP-based crawlers like [`HttpCrawler`](https://crawlee.dev/python/api/class/HttpCrawler), [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) and [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and browser-based crawlers like [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler), to suit different scraping needs. + +In this guide, you'll learn how to use Crawlee with [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) to build Apify Actors for web scraping. + +## Actor with BeautifulSoupCrawler + +The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) is ideal for extracting data from static HTML pages. It uses [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) for parsing and [`ImpitHttpClient`](https://crawlee.dev/python/api/class/ImpitHttpClient) for HTTP communication, ensuring efficient and lightweight scraping. 
If you do not need to execute JavaScript on the page, [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) is a great choice for your scraping tasks. Below is an example of how to use it in an Apify Actor. + + {CrawleeBeautifulSoupExample} + + +## Actor with ParselCrawler + +The [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) works in the same way as [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), but it uses the [Parsel](https://parsel.readthedocs.io/en/latest/) library for HTML parsing. This allows for more powerful and flexible data extraction using [XPath](https://en.wikipedia.org/wiki/XPath) selectors. It is also typically faster than [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler). Below is an example of how to use [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) in an Apify Actor. + + {CrawleeParselExample} + + +## Actor with PlaywrightCrawler + +The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content rendering. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor. + + {CrawleePlaywrightExample} + + +## Conclusion + +In this guide, you learned how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. By using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/05_scrapy.mdx b/docs/03_guides/06_scrapy.mdx similarity index 100% rename from docs/02_guides/05_scrapy.mdx rename to docs/03_guides/06_scrapy.mdx diff --git a/docs/02_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py similarity index 96% rename from docs/02_guides/code/01_beautifulsoup_httpx.py rename to docs/03_guides/code/01_beautifulsoup_httpx.py index 36d3bca7..157948d0 100644 --- a/docs/02_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,9 +1,8 @@ -from __future__ import annotations - +import asyncio from urllib.parse import urljoin +import httpx from bs4 import BeautifulSoup -from httpx import AsyncClient from apify import Actor, Request @@ -32,7 +31,7 @@ async def main() -> None: await request_queue.add_request(new_request) # Create an HTTPX client to fetch the HTML content of the URLs. - async with AsyncClient() as client: + async with httpx.AsyncClient() as client: # Process the URLs from the request queue.
while request := await request_queue.fetch_next_request(): url = request.url @@ -83,3 +82,7 @@ async def main() -> None: finally: # Mark the request as handled to ensure it is not processed again. await request_queue.mark_request_as_handled(new_request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py new file mode 100644 index 00000000..21b5e74f --- /dev/null +++ b/docs/03_guides/code/02_parsel_impit.py @@ -0,0 +1,94 @@ +import asyncio +from urllib.parse import urljoin + +import impit +import parsel + +from apify import Actor, Request + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Create an Impit client to fetch the HTML content of the URLs. + async with impit.AsyncClient() as client: + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fetch the HTTP response from the specified URL using Impit. + response = await client.get(url) + + # Parse the HTML content using Parsel Selector. + selector = parsel.Selector(text=response.text) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. + if depth < max_depth: + # Extract all links using CSS selector + links = selector.css('a::attr(href)').getall() + for link_href in links: + link_url = urljoin(url, link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data using Parsel selectors. + title = selector.css('title::text').get() + h1s = selector.css('h1::text').getall() + h2s = selector.css('h2::text').getall() + h3s = selector.css('h3::text').getall() + + data = { + 'url': url, + 'title': title, + 'h1s': h1s, + 'h2s': h2s, + 'h3s': h3s, + } + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. 
+ await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/02_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py similarity index 98% rename from docs/02_guides/code/03_playwright.py rename to docs/03_guides/code/03_playwright.py index 78ebdda3..14868ad8 100644 --- a/docs/02_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,5 +1,4 @@ -from __future__ import annotations - +import asyncio from urllib.parse import urljoin from playwright.async_api import async_playwright @@ -92,3 +91,7 @@ async def main() -> None: await page.close() # Mark the request as handled to ensure it is not processed again. await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/02_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py similarity index 98% rename from docs/02_guides/code/04_selenium.py rename to docs/03_guides/code/04_selenium.py index 75c55b2f..8cffe606 100644 --- a/docs/02_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import asyncio from urllib.parse import urljoin @@ -102,3 +100,7 @@ async def main() -> None: await request_queue.mark_request_as_handled(request) driver.quit() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/05_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py new file mode 100644 index 00000000..4d3a81d7 --- /dev/null +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -0,0 +1,55 @@ +import asyncio + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + +from apify import Actor + +# Create a crawler. +crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, +) + + +# Define a request handler, which will be called for every request. +@crawler.router.default_handler +async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + Actor.log.info(f'Scraping {context.request.url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links(strategy='same-domain') + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = [ + url.get('url') + for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + ] + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Run the crawler with the starting requests. 
+ await crawler.run(start_urls) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py new file mode 100644 index 00000000..31f39d8b --- /dev/null +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -0,0 +1,55 @@ +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + +from apify import Actor + +# Create a crawler. +crawler = ParselCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, +) + + +# Define a request handler, which will be called for every request. +@crawler.router.default_handler +async def request_handler(context: ParselCrawlingContext) -> None: + Actor.log.info(f'Scraping {context.request.url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + 'h1s': context.selector.xpath('//h1/text()').getall(), + 'h2s': context.selector.xpath('//h2/text()').getall(), + 'h3s': context.selector.xpath('//h3/text()').getall(), + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links(strategy='same-domain') + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = [ + url.get('url') + for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + ] + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Run the crawler with the starting requests. + await crawler.run(start_urls) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py new file mode 100644 index 00000000..be4ea29e --- /dev/null +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -0,0 +1,58 @@ +import asyncio + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +from apify import Actor + +# Create a crawler. +crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, + # Run the browser in a headless mode. + headless=True, + browser_launch_options={'args': ['--disable-gpu']}, +) + + +# Define a request handler, which will be called for every request. +@crawler.router.default_handler +async def request_handler(context: PlaywrightCrawlingContext) -> None: + Actor.log.info(f'Scraping {context.request.url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()], + 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()], + 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()], + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links(strategy='same-domain') + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. 
+ actor_input = await Actor.get_input() or {} + start_urls = [ + url.get('url') + for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + ] + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Run the crawler with the starting requests. + await crawler.run(start_urls) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/02_guides/code/scrapy_project/src/__init__.py b/docs/03_guides/code/scrapy_project/src/__init__.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/__init__.py rename to docs/03_guides/code/scrapy_project/src/__init__.py diff --git a/docs/02_guides/code/scrapy_project/src/__main__.py b/docs/03_guides/code/scrapy_project/src/__main__.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/__main__.py rename to docs/03_guides/code/scrapy_project/src/__main__.py diff --git a/docs/02_guides/code/scrapy_project/src/items.py b/docs/03_guides/code/scrapy_project/src/items.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/items.py rename to docs/03_guides/code/scrapy_project/src/items.py diff --git a/docs/02_guides/code/scrapy_project/src/main.py b/docs/03_guides/code/scrapy_project/src/main.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/main.py rename to docs/03_guides/code/scrapy_project/src/main.py diff --git a/docs/02_guides/code/scrapy_project/src/py.typed b/docs/03_guides/code/scrapy_project/src/py.typed similarity index 100% rename from docs/02_guides/code/scrapy_project/src/py.typed rename to docs/03_guides/code/scrapy_project/src/py.typed diff --git a/docs/02_guides/code/scrapy_project/src/settings.py b/docs/03_guides/code/scrapy_project/src/settings.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/settings.py rename to docs/03_guides/code/scrapy_project/src/settings.py diff --git a/docs/02_guides/code/scrapy_project/src/spiders/__init__.py b/docs/03_guides/code/scrapy_project/src/spiders/__init__.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/spiders/__init__.py rename to docs/03_guides/code/scrapy_project/src/spiders/__init__.py diff --git a/docs/02_guides/code/scrapy_project/src/spiders/py.typed b/docs/03_guides/code/scrapy_project/src/spiders/py.typed similarity index 100% rename from docs/02_guides/code/scrapy_project/src/spiders/py.typed rename to docs/03_guides/code/scrapy_project/src/spiders/py.typed diff --git a/docs/02_guides/code/scrapy_project/src/spiders/title.py b/docs/03_guides/code/scrapy_project/src/spiders/title.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/spiders/title.py rename to docs/03_guides/code/scrapy_project/src/spiders/title.py diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index 9365521e..410ea904 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -11,7 +11,7 @@ async def test_actor_scrapy_title_spider( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: - base_path = Path('docs/02_guides/code/scrapy_project') + base_path = Path('docs/03_guides/code/scrapy_project') actor_source_files = { 'src/__init__.py': (base_path / 'src/__init__.py').read_text(), diff --git a/website/sidebars.js b/website/sidebars.js index f6b2040e..c4a31842 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -13,23 +13,23 @@ 
module.exports = { }, { type: 'category', - label: 'Guides', + label: 'Concepts', collapsed: true, items: [ { type: 'autogenerated', - dirName: '02_guides', + dirName: '02_concepts', }, ], }, { type: 'category', - label: 'Concepts', + label: 'Guides', collapsed: true, items: [ { type: 'autogenerated', - dirName: '03_concepts', + dirName: '03_guides', }, ], },