From cee1d9ca74051069ed042374a048423743d04b28 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 2 Sep 2025 16:26:20 +0200 Subject: [PATCH 1/6] reorder of sections --- docs/{03_concepts => 02_concepts}/01_actor_lifecycle.mdx | 0 docs/{03_concepts => 02_concepts}/02_actor_input.mdx | 0 docs/{03_concepts => 02_concepts}/03_storages.mdx | 0 docs/{03_concepts => 02_concepts}/04_actor_events.mdx | 0 docs/{03_concepts => 02_concepts}/05_proxy_management.mdx | 0 .../06_interacting_with_other_actors.mdx | 0 docs/{03_concepts => 02_concepts}/07_webhooks.mdx | 0 docs/{03_concepts => 02_concepts}/08_access_apify_api.mdx | 0 .../{03_concepts => 02_concepts}/09_running_webserver.mdx | 0 docs/{03_concepts => 02_concepts}/10_logging.mdx | 0 docs/{03_concepts => 02_concepts}/11_configuration.mdx | 0 docs/{03_concepts => 02_concepts}/12_pay_per_event.mdx | 0 .../code/01_context_manager.py | 0 docs/{03_concepts => 02_concepts}/code/01_init_exit.py | 0 docs/{03_concepts => 02_concepts}/code/01_reboot.py | 0 .../code/01_status_message.py | 0 docs/{03_concepts => 02_concepts}/code/02_input.py | 0 .../code/03_dataset_exports.py | 0 .../code/03_dataset_read_write.py | 0 .../code/03_deleting_storages.py | 0 .../{03_concepts => 02_concepts}/code/03_kvs_iterating.py | 0 .../code/03_kvs_public_url.py | 0 .../code/03_kvs_read_write.py | 0 .../code/03_opening_storages.py | 0 docs/{03_concepts => 02_concepts}/code/03_rq.py | 0 docs/{03_concepts => 02_concepts}/code/04_actor_events.py | 0 docs/{03_concepts => 02_concepts}/code/05_apify_proxy.py | 0 .../code/05_apify_proxy_config.py | 0 docs/{03_concepts => 02_concepts}/code/05_custom_proxy.py | 0 .../code/05_custom_proxy_function.py | 0 .../code/05_proxy_actor_input.py | 0 docs/{03_concepts => 02_concepts}/code/05_proxy_httpx.py | 0 .../code/05_proxy_rotation.py | 0 .../code/06_interacting_call.py | 0 .../code/06_interacting_call_task.py | 0 .../code/06_interacting_metamorph.py | 0 .../code/06_interacting_start.py | 0 docs/{03_concepts => 02_concepts}/code/07_webhook.py | 0 .../code/07_webhook_preventing.py | 0 docs/{03_concepts => 02_concepts}/code/08_actor_client.py | 0 .../code/08_actor_new_client.py | 0 docs/{03_concepts => 02_concepts}/code/09_webserver.py | 0 docs/{03_concepts => 02_concepts}/code/10_log_config.py | 0 docs/{03_concepts => 02_concepts}/code/10_logger_usage.py | 0 docs/{03_concepts => 02_concepts}/code/10_redirect_log.py | 0 .../code/10_redirect_log_existing_run.py | 0 docs/{03_concepts => 02_concepts}/code/11_config.py | 0 docs/{03_concepts => 02_concepts}/code/actor_charge.py | 0 .../code/conditional_actor_charge.py | 0 docs/{02_guides => 03_guides}/01_beautifulsoup_httpx.mdx | 0 docs/{02_guides => 03_guides}/02_crawlee.mdx | 0 docs/{02_guides => 03_guides}/03_playwright.mdx | 0 docs/{02_guides => 03_guides}/04_selenium.mdx | 0 docs/{02_guides => 03_guides}/05_scrapy.mdx | 0 .../code/01_beautifulsoup_httpx.py | 0 .../code/02_crawlee_beautifulsoup.py | 0 .../code/02_crawlee_playwright.py | 0 docs/{02_guides => 03_guides}/code/03_playwright.py | 0 docs/{02_guides => 03_guides}/code/04_selenium.py | 0 .../code/scrapy_project/src/__init__.py | 0 .../code/scrapy_project/src/__main__.py | 0 .../code/scrapy_project/src/items.py | 0 .../code/scrapy_project/src/main.py | 0 .../code/scrapy_project/src/py.typed | 0 .../code/scrapy_project/src/settings.py | 0 .../code/scrapy_project/src/spiders/__init__.py | 0 .../code/scrapy_project/src/spiders/py.typed | 0 .../code/scrapy_project/src/spiders/title.py | 0 website/sidebars.js | 8 ++++---- 69 files changed, 
4 insertions(+), 4 deletions(-) rename docs/{03_concepts => 02_concepts}/01_actor_lifecycle.mdx (100%) rename docs/{03_concepts => 02_concepts}/02_actor_input.mdx (100%) rename docs/{03_concepts => 02_concepts}/03_storages.mdx (100%) rename docs/{03_concepts => 02_concepts}/04_actor_events.mdx (100%) rename docs/{03_concepts => 02_concepts}/05_proxy_management.mdx (100%) rename docs/{03_concepts => 02_concepts}/06_interacting_with_other_actors.mdx (100%) rename docs/{03_concepts => 02_concepts}/07_webhooks.mdx (100%) rename docs/{03_concepts => 02_concepts}/08_access_apify_api.mdx (100%) rename docs/{03_concepts => 02_concepts}/09_running_webserver.mdx (100%) rename docs/{03_concepts => 02_concepts}/10_logging.mdx (100%) rename docs/{03_concepts => 02_concepts}/11_configuration.mdx (100%) rename docs/{03_concepts => 02_concepts}/12_pay_per_event.mdx (100%) rename docs/{03_concepts => 02_concepts}/code/01_context_manager.py (100%) rename docs/{03_concepts => 02_concepts}/code/01_init_exit.py (100%) rename docs/{03_concepts => 02_concepts}/code/01_reboot.py (100%) rename docs/{03_concepts => 02_concepts}/code/01_status_message.py (100%) rename docs/{03_concepts => 02_concepts}/code/02_input.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_dataset_exports.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_dataset_read_write.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_deleting_storages.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_kvs_iterating.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_kvs_public_url.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_kvs_read_write.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_opening_storages.py (100%) rename docs/{03_concepts => 02_concepts}/code/03_rq.py (100%) rename docs/{03_concepts => 02_concepts}/code/04_actor_events.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_apify_proxy.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_apify_proxy_config.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_custom_proxy.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_custom_proxy_function.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_proxy_actor_input.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_proxy_httpx.py (100%) rename docs/{03_concepts => 02_concepts}/code/05_proxy_rotation.py (100%) rename docs/{03_concepts => 02_concepts}/code/06_interacting_call.py (100%) rename docs/{03_concepts => 02_concepts}/code/06_interacting_call_task.py (100%) rename docs/{03_concepts => 02_concepts}/code/06_interacting_metamorph.py (100%) rename docs/{03_concepts => 02_concepts}/code/06_interacting_start.py (100%) rename docs/{03_concepts => 02_concepts}/code/07_webhook.py (100%) rename docs/{03_concepts => 02_concepts}/code/07_webhook_preventing.py (100%) rename docs/{03_concepts => 02_concepts}/code/08_actor_client.py (100%) rename docs/{03_concepts => 02_concepts}/code/08_actor_new_client.py (100%) rename docs/{03_concepts => 02_concepts}/code/09_webserver.py (100%) rename docs/{03_concepts => 02_concepts}/code/10_log_config.py (100%) rename docs/{03_concepts => 02_concepts}/code/10_logger_usage.py (100%) rename docs/{03_concepts => 02_concepts}/code/10_redirect_log.py (100%) rename docs/{03_concepts => 02_concepts}/code/10_redirect_log_existing_run.py (100%) rename docs/{03_concepts => 02_concepts}/code/11_config.py (100%) rename docs/{03_concepts => 02_concepts}/code/actor_charge.py (100%) rename docs/{03_concepts => 
02_concepts}/code/conditional_actor_charge.py (100%) rename docs/{02_guides => 03_guides}/01_beautifulsoup_httpx.mdx (100%) rename docs/{02_guides => 03_guides}/02_crawlee.mdx (100%) rename docs/{02_guides => 03_guides}/03_playwright.mdx (100%) rename docs/{02_guides => 03_guides}/04_selenium.mdx (100%) rename docs/{02_guides => 03_guides}/05_scrapy.mdx (100%) rename docs/{02_guides => 03_guides}/code/01_beautifulsoup_httpx.py (100%) rename docs/{02_guides => 03_guides}/code/02_crawlee_beautifulsoup.py (100%) rename docs/{02_guides => 03_guides}/code/02_crawlee_playwright.py (100%) rename docs/{02_guides => 03_guides}/code/03_playwright.py (100%) rename docs/{02_guides => 03_guides}/code/04_selenium.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/__init__.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/__main__.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/items.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/main.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/py.typed (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/settings.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/spiders/__init__.py (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/spiders/py.typed (100%) rename docs/{02_guides => 03_guides}/code/scrapy_project/src/spiders/title.py (100%) diff --git a/docs/03_concepts/01_actor_lifecycle.mdx b/docs/02_concepts/01_actor_lifecycle.mdx similarity index 100% rename from docs/03_concepts/01_actor_lifecycle.mdx rename to docs/02_concepts/01_actor_lifecycle.mdx diff --git a/docs/03_concepts/02_actor_input.mdx b/docs/02_concepts/02_actor_input.mdx similarity index 100% rename from docs/03_concepts/02_actor_input.mdx rename to docs/02_concepts/02_actor_input.mdx diff --git a/docs/03_concepts/03_storages.mdx b/docs/02_concepts/03_storages.mdx similarity index 100% rename from docs/03_concepts/03_storages.mdx rename to docs/02_concepts/03_storages.mdx diff --git a/docs/03_concepts/04_actor_events.mdx b/docs/02_concepts/04_actor_events.mdx similarity index 100% rename from docs/03_concepts/04_actor_events.mdx rename to docs/02_concepts/04_actor_events.mdx diff --git a/docs/03_concepts/05_proxy_management.mdx b/docs/02_concepts/05_proxy_management.mdx similarity index 100% rename from docs/03_concepts/05_proxy_management.mdx rename to docs/02_concepts/05_proxy_management.mdx diff --git a/docs/03_concepts/06_interacting_with_other_actors.mdx b/docs/02_concepts/06_interacting_with_other_actors.mdx similarity index 100% rename from docs/03_concepts/06_interacting_with_other_actors.mdx rename to docs/02_concepts/06_interacting_with_other_actors.mdx diff --git a/docs/03_concepts/07_webhooks.mdx b/docs/02_concepts/07_webhooks.mdx similarity index 100% rename from docs/03_concepts/07_webhooks.mdx rename to docs/02_concepts/07_webhooks.mdx diff --git a/docs/03_concepts/08_access_apify_api.mdx b/docs/02_concepts/08_access_apify_api.mdx similarity index 100% rename from docs/03_concepts/08_access_apify_api.mdx rename to docs/02_concepts/08_access_apify_api.mdx diff --git a/docs/03_concepts/09_running_webserver.mdx b/docs/02_concepts/09_running_webserver.mdx similarity index 100% rename from docs/03_concepts/09_running_webserver.mdx rename to docs/02_concepts/09_running_webserver.mdx diff --git a/docs/03_concepts/10_logging.mdx b/docs/02_concepts/10_logging.mdx similarity index 100% rename from docs/03_concepts/10_logging.mdx 
rename to docs/02_concepts/10_logging.mdx diff --git a/docs/03_concepts/11_configuration.mdx b/docs/02_concepts/11_configuration.mdx similarity index 100% rename from docs/03_concepts/11_configuration.mdx rename to docs/02_concepts/11_configuration.mdx diff --git a/docs/03_concepts/12_pay_per_event.mdx b/docs/02_concepts/12_pay_per_event.mdx similarity index 100% rename from docs/03_concepts/12_pay_per_event.mdx rename to docs/02_concepts/12_pay_per_event.mdx diff --git a/docs/03_concepts/code/01_context_manager.py b/docs/02_concepts/code/01_context_manager.py similarity index 100% rename from docs/03_concepts/code/01_context_manager.py rename to docs/02_concepts/code/01_context_manager.py diff --git a/docs/03_concepts/code/01_init_exit.py b/docs/02_concepts/code/01_init_exit.py similarity index 100% rename from docs/03_concepts/code/01_init_exit.py rename to docs/02_concepts/code/01_init_exit.py diff --git a/docs/03_concepts/code/01_reboot.py b/docs/02_concepts/code/01_reboot.py similarity index 100% rename from docs/03_concepts/code/01_reboot.py rename to docs/02_concepts/code/01_reboot.py diff --git a/docs/03_concepts/code/01_status_message.py b/docs/02_concepts/code/01_status_message.py similarity index 100% rename from docs/03_concepts/code/01_status_message.py rename to docs/02_concepts/code/01_status_message.py diff --git a/docs/03_concepts/code/02_input.py b/docs/02_concepts/code/02_input.py similarity index 100% rename from docs/03_concepts/code/02_input.py rename to docs/02_concepts/code/02_input.py diff --git a/docs/03_concepts/code/03_dataset_exports.py b/docs/02_concepts/code/03_dataset_exports.py similarity index 100% rename from docs/03_concepts/code/03_dataset_exports.py rename to docs/02_concepts/code/03_dataset_exports.py diff --git a/docs/03_concepts/code/03_dataset_read_write.py b/docs/02_concepts/code/03_dataset_read_write.py similarity index 100% rename from docs/03_concepts/code/03_dataset_read_write.py rename to docs/02_concepts/code/03_dataset_read_write.py diff --git a/docs/03_concepts/code/03_deleting_storages.py b/docs/02_concepts/code/03_deleting_storages.py similarity index 100% rename from docs/03_concepts/code/03_deleting_storages.py rename to docs/02_concepts/code/03_deleting_storages.py diff --git a/docs/03_concepts/code/03_kvs_iterating.py b/docs/02_concepts/code/03_kvs_iterating.py similarity index 100% rename from docs/03_concepts/code/03_kvs_iterating.py rename to docs/02_concepts/code/03_kvs_iterating.py diff --git a/docs/03_concepts/code/03_kvs_public_url.py b/docs/02_concepts/code/03_kvs_public_url.py similarity index 100% rename from docs/03_concepts/code/03_kvs_public_url.py rename to docs/02_concepts/code/03_kvs_public_url.py diff --git a/docs/03_concepts/code/03_kvs_read_write.py b/docs/02_concepts/code/03_kvs_read_write.py similarity index 100% rename from docs/03_concepts/code/03_kvs_read_write.py rename to docs/02_concepts/code/03_kvs_read_write.py diff --git a/docs/03_concepts/code/03_opening_storages.py b/docs/02_concepts/code/03_opening_storages.py similarity index 100% rename from docs/03_concepts/code/03_opening_storages.py rename to docs/02_concepts/code/03_opening_storages.py diff --git a/docs/03_concepts/code/03_rq.py b/docs/02_concepts/code/03_rq.py similarity index 100% rename from docs/03_concepts/code/03_rq.py rename to docs/02_concepts/code/03_rq.py diff --git a/docs/03_concepts/code/04_actor_events.py b/docs/02_concepts/code/04_actor_events.py similarity index 100% rename from docs/03_concepts/code/04_actor_events.py rename to 
docs/02_concepts/code/04_actor_events.py diff --git a/docs/03_concepts/code/05_apify_proxy.py b/docs/02_concepts/code/05_apify_proxy.py similarity index 100% rename from docs/03_concepts/code/05_apify_proxy.py rename to docs/02_concepts/code/05_apify_proxy.py diff --git a/docs/03_concepts/code/05_apify_proxy_config.py b/docs/02_concepts/code/05_apify_proxy_config.py similarity index 100% rename from docs/03_concepts/code/05_apify_proxy_config.py rename to docs/02_concepts/code/05_apify_proxy_config.py diff --git a/docs/03_concepts/code/05_custom_proxy.py b/docs/02_concepts/code/05_custom_proxy.py similarity index 100% rename from docs/03_concepts/code/05_custom_proxy.py rename to docs/02_concepts/code/05_custom_proxy.py diff --git a/docs/03_concepts/code/05_custom_proxy_function.py b/docs/02_concepts/code/05_custom_proxy_function.py similarity index 100% rename from docs/03_concepts/code/05_custom_proxy_function.py rename to docs/02_concepts/code/05_custom_proxy_function.py diff --git a/docs/03_concepts/code/05_proxy_actor_input.py b/docs/02_concepts/code/05_proxy_actor_input.py similarity index 100% rename from docs/03_concepts/code/05_proxy_actor_input.py rename to docs/02_concepts/code/05_proxy_actor_input.py diff --git a/docs/03_concepts/code/05_proxy_httpx.py b/docs/02_concepts/code/05_proxy_httpx.py similarity index 100% rename from docs/03_concepts/code/05_proxy_httpx.py rename to docs/02_concepts/code/05_proxy_httpx.py diff --git a/docs/03_concepts/code/05_proxy_rotation.py b/docs/02_concepts/code/05_proxy_rotation.py similarity index 100% rename from docs/03_concepts/code/05_proxy_rotation.py rename to docs/02_concepts/code/05_proxy_rotation.py diff --git a/docs/03_concepts/code/06_interacting_call.py b/docs/02_concepts/code/06_interacting_call.py similarity index 100% rename from docs/03_concepts/code/06_interacting_call.py rename to docs/02_concepts/code/06_interacting_call.py diff --git a/docs/03_concepts/code/06_interacting_call_task.py b/docs/02_concepts/code/06_interacting_call_task.py similarity index 100% rename from docs/03_concepts/code/06_interacting_call_task.py rename to docs/02_concepts/code/06_interacting_call_task.py diff --git a/docs/03_concepts/code/06_interacting_metamorph.py b/docs/02_concepts/code/06_interacting_metamorph.py similarity index 100% rename from docs/03_concepts/code/06_interacting_metamorph.py rename to docs/02_concepts/code/06_interacting_metamorph.py diff --git a/docs/03_concepts/code/06_interacting_start.py b/docs/02_concepts/code/06_interacting_start.py similarity index 100% rename from docs/03_concepts/code/06_interacting_start.py rename to docs/02_concepts/code/06_interacting_start.py diff --git a/docs/03_concepts/code/07_webhook.py b/docs/02_concepts/code/07_webhook.py similarity index 100% rename from docs/03_concepts/code/07_webhook.py rename to docs/02_concepts/code/07_webhook.py diff --git a/docs/03_concepts/code/07_webhook_preventing.py b/docs/02_concepts/code/07_webhook_preventing.py similarity index 100% rename from docs/03_concepts/code/07_webhook_preventing.py rename to docs/02_concepts/code/07_webhook_preventing.py diff --git a/docs/03_concepts/code/08_actor_client.py b/docs/02_concepts/code/08_actor_client.py similarity index 100% rename from docs/03_concepts/code/08_actor_client.py rename to docs/02_concepts/code/08_actor_client.py diff --git a/docs/03_concepts/code/08_actor_new_client.py b/docs/02_concepts/code/08_actor_new_client.py similarity index 100% rename from docs/03_concepts/code/08_actor_new_client.py rename to 
docs/02_concepts/code/08_actor_new_client.py diff --git a/docs/03_concepts/code/09_webserver.py b/docs/02_concepts/code/09_webserver.py similarity index 100% rename from docs/03_concepts/code/09_webserver.py rename to docs/02_concepts/code/09_webserver.py diff --git a/docs/03_concepts/code/10_log_config.py b/docs/02_concepts/code/10_log_config.py similarity index 100% rename from docs/03_concepts/code/10_log_config.py rename to docs/02_concepts/code/10_log_config.py diff --git a/docs/03_concepts/code/10_logger_usage.py b/docs/02_concepts/code/10_logger_usage.py similarity index 100% rename from docs/03_concepts/code/10_logger_usage.py rename to docs/02_concepts/code/10_logger_usage.py diff --git a/docs/03_concepts/code/10_redirect_log.py b/docs/02_concepts/code/10_redirect_log.py similarity index 100% rename from docs/03_concepts/code/10_redirect_log.py rename to docs/02_concepts/code/10_redirect_log.py diff --git a/docs/03_concepts/code/10_redirect_log_existing_run.py b/docs/02_concepts/code/10_redirect_log_existing_run.py similarity index 100% rename from docs/03_concepts/code/10_redirect_log_existing_run.py rename to docs/02_concepts/code/10_redirect_log_existing_run.py diff --git a/docs/03_concepts/code/11_config.py b/docs/02_concepts/code/11_config.py similarity index 100% rename from docs/03_concepts/code/11_config.py rename to docs/02_concepts/code/11_config.py diff --git a/docs/03_concepts/code/actor_charge.py b/docs/02_concepts/code/actor_charge.py similarity index 100% rename from docs/03_concepts/code/actor_charge.py rename to docs/02_concepts/code/actor_charge.py diff --git a/docs/03_concepts/code/conditional_actor_charge.py b/docs/02_concepts/code/conditional_actor_charge.py similarity index 100% rename from docs/03_concepts/code/conditional_actor_charge.py rename to docs/02_concepts/code/conditional_actor_charge.py diff --git a/docs/02_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx similarity index 100% rename from docs/02_guides/01_beautifulsoup_httpx.mdx rename to docs/03_guides/01_beautifulsoup_httpx.mdx diff --git a/docs/02_guides/02_crawlee.mdx b/docs/03_guides/02_crawlee.mdx similarity index 100% rename from docs/02_guides/02_crawlee.mdx rename to docs/03_guides/02_crawlee.mdx diff --git a/docs/02_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx similarity index 100% rename from docs/02_guides/03_playwright.mdx rename to docs/03_guides/03_playwright.mdx diff --git a/docs/02_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx similarity index 100% rename from docs/02_guides/04_selenium.mdx rename to docs/03_guides/04_selenium.mdx diff --git a/docs/02_guides/05_scrapy.mdx b/docs/03_guides/05_scrapy.mdx similarity index 100% rename from docs/02_guides/05_scrapy.mdx rename to docs/03_guides/05_scrapy.mdx diff --git a/docs/02_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py similarity index 100% rename from docs/02_guides/code/01_beautifulsoup_httpx.py rename to docs/03_guides/code/01_beautifulsoup_httpx.py diff --git a/docs/02_guides/code/02_crawlee_beautifulsoup.py b/docs/03_guides/code/02_crawlee_beautifulsoup.py similarity index 100% rename from docs/02_guides/code/02_crawlee_beautifulsoup.py rename to docs/03_guides/code/02_crawlee_beautifulsoup.py diff --git a/docs/02_guides/code/02_crawlee_playwright.py b/docs/03_guides/code/02_crawlee_playwright.py similarity index 100% rename from docs/02_guides/code/02_crawlee_playwright.py rename to docs/03_guides/code/02_crawlee_playwright.py diff 
--git a/docs/02_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py similarity index 100% rename from docs/02_guides/code/03_playwright.py rename to docs/03_guides/code/03_playwright.py diff --git a/docs/02_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py similarity index 100% rename from docs/02_guides/code/04_selenium.py rename to docs/03_guides/code/04_selenium.py diff --git a/docs/02_guides/code/scrapy_project/src/__init__.py b/docs/03_guides/code/scrapy_project/src/__init__.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/__init__.py rename to docs/03_guides/code/scrapy_project/src/__init__.py diff --git a/docs/02_guides/code/scrapy_project/src/__main__.py b/docs/03_guides/code/scrapy_project/src/__main__.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/__main__.py rename to docs/03_guides/code/scrapy_project/src/__main__.py diff --git a/docs/02_guides/code/scrapy_project/src/items.py b/docs/03_guides/code/scrapy_project/src/items.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/items.py rename to docs/03_guides/code/scrapy_project/src/items.py diff --git a/docs/02_guides/code/scrapy_project/src/main.py b/docs/03_guides/code/scrapy_project/src/main.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/main.py rename to docs/03_guides/code/scrapy_project/src/main.py diff --git a/docs/02_guides/code/scrapy_project/src/py.typed b/docs/03_guides/code/scrapy_project/src/py.typed similarity index 100% rename from docs/02_guides/code/scrapy_project/src/py.typed rename to docs/03_guides/code/scrapy_project/src/py.typed diff --git a/docs/02_guides/code/scrapy_project/src/settings.py b/docs/03_guides/code/scrapy_project/src/settings.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/settings.py rename to docs/03_guides/code/scrapy_project/src/settings.py diff --git a/docs/02_guides/code/scrapy_project/src/spiders/__init__.py b/docs/03_guides/code/scrapy_project/src/spiders/__init__.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/spiders/__init__.py rename to docs/03_guides/code/scrapy_project/src/spiders/__init__.py diff --git a/docs/02_guides/code/scrapy_project/src/spiders/py.typed b/docs/03_guides/code/scrapy_project/src/spiders/py.typed similarity index 100% rename from docs/02_guides/code/scrapy_project/src/spiders/py.typed rename to docs/03_guides/code/scrapy_project/src/spiders/py.typed diff --git a/docs/02_guides/code/scrapy_project/src/spiders/title.py b/docs/03_guides/code/scrapy_project/src/spiders/title.py similarity index 100% rename from docs/02_guides/code/scrapy_project/src/spiders/title.py rename to docs/03_guides/code/scrapy_project/src/spiders/title.py diff --git a/website/sidebars.js b/website/sidebars.js index f6b2040e..c4a31842 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -13,23 +13,23 @@ module.exports = { }, { type: 'category', - label: 'Guides', + label: 'Concepts', collapsed: true, items: [ { type: 'autogenerated', - dirName: '02_guides', + dirName: '02_concepts', }, ], }, { type: 'category', - label: 'Concepts', + label: 'Guides', collapsed: true, items: [ { type: 'autogenerated', - dirName: '03_concepts', + dirName: '03_guides', }, ], }, From eaef0aaa26ce872260160a1ef5b68a7192bcab03 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 2 Sep 2025 16:47:07 +0200 Subject: [PATCH 2/6] add crawlee parsel --- docs/03_guides/02_crawlee.mdx | 19 +++++--- 
docs/03_guides/code/02_crawlee_parsel.py | 55 ++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 docs/03_guides/code/02_crawlee_parsel.py diff --git a/docs/03_guides/02_crawlee.mdx b/docs/03_guides/02_crawlee.mdx index b040cad2..0b0c0e2c 100644 --- a/docs/03_guides/02_crawlee.mdx +++ b/docs/03_guides/02_crawlee.mdx @@ -6,27 +6,36 @@ title: Using Crawlee import CodeBlock from '@theme/CodeBlock'; import CrawleeBeautifulSoupExample from '!!raw-loader!./code/02_crawlee_beautifulsoup.py'; +import CrawleeParselExample from '!!raw-loader!./code/02_crawlee_parsel.py'; import CrawleePlaywrightExample from '!!raw-loader!./code/02_crawlee_playwright.py'; In this guide you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. ## Introduction -`Crawlee` is a Python library for web scraping and browser automation that provides a robust and flexible framework for building web scraping tasks. It seamlessly integrates with the Apify platform and supports a variety of scraping techniques, from static HTML parsing to dynamic JavaScript-rendered content handling. Crawlee offers a range of crawlers, including HTTP-based crawlers like [`HttpCrawler`](https://crawlee.dev/python/api/class/HttpCrawler), [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) and [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and browser-based crawlers like [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler), to suit different scraping needs. +[Crawlee](https://crawlee.dev/python) is a Python library for web scraping and browser automation that provides a robust and flexible framework for building web scraping tasks. It seamlessly integrates with the Apify platform and supports a variety of scraping techniques, from static HTML parsing to dynamic JavaScript-rendered content handling. Crawlee offers a range of crawlers, including HTTP-based crawlers like [`HttpCrawler`](https://crawlee.dev/python/api/class/HttpCrawler), [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) and [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and browser-based crawlers like [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler), to suit different scraping needs. -In this guide, you'll learn how to use Crawlee with `BeautifulSoupCrawler` and `PlaywrightCrawler` to build Apify Actors for web scraping. +In this guide, you'll learn how to use Crawlee with [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) to build Apify Actors for web scraping. ## Actor with BeautifulSoupCrawler -The `BeautifulSoupCrawler` is ideal for extracting data from static HTML pages. It uses `BeautifulSoup` for parsing and [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication, ensuring efficient and lightweight scraping. If you do not need to execute JavaScript on the page, `BeautifulSoupCrawler` is a great choice for your scraping tasks. Below is an example of how to use `BeautifulSoupCrawler` in an Apify Actor. +The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) is ideal for extracting data from static HTML pages. 
It uses [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) for parsing and [`ImpitHttpClient`](https://crawlee.dev/python/api/class/ImpitHttpClient) for HTTP communication, ensuring efficient and lightweight scraping. If you do not need to execute JavaScript on the page, [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) is a great choice for your scraping tasks. Below is an example of how to use it in an Apify Actor. {CrawleeBeautifulSoupExample} +## Actor with ParselCrawler + +The [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) works in the same way as [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), but it uses the [Parsel](https://parsel.readthedocs.io/en/latest/) library for HTML parsing. This allows for more powerful and flexible data extraction using [XPath](https://en.wikipedia.org/wiki/XPath) selectors. It should be faster than [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler). Below is an example of how to use [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) in an Apify Actor. + + + {CrawleeParselExample} + + ## Actor with PlaywrightCrawler -The `PlaywrightCrawler` is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use `PlaywrightCrawler` in an Apify Actor. +The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor. {CrawleePlaywrightExample} @@ -34,4 +43,4 @@ The `PlaywrightCrawler` is built for handling dynamic web Jav ## Conclusion -In this guide, you learned how to use the `Crawlee` library in your Apify Actors. By using the `BeautifulSoupCrawler` and `PlaywrightCrawler` crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +In this guide, you learned how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. By using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
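For reference, the Parsel selector API that the new ParselCrawler section above relies on boils down to the following minimal sketch. It is independent of Crawlee and of this patch; the HTML string is purely illustrative.

from parsel import Selector

html = '<html><head><title>Example</title></head><body><h1>Hello</h1><h1>World</h1></body></html>'
selector = Selector(text=html)

# Equivalent extractions with CSS and XPath selectors.
print(selector.css('title::text').get())       # 'Example'
print(selector.xpath('//title/text()').get())  # 'Example'
print(selector.xpath('//h1/text()').getall())  # ['Hello', 'World']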
diff --git a/docs/03_guides/code/02_crawlee_parsel.py b/docs/03_guides/code/02_crawlee_parsel.py new file mode 100644 index 00000000..822754b9 --- /dev/null +++ b/docs/03_guides/code/02_crawlee_parsel.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + +from apify import Actor + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = [ + url.get('url') + for url in actor_input.get( + 'start_urls', + [{'url': 'https://apify.com'}], + ) + ] + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a crawler. + crawler = ParselCrawler( + # Limit the crawl to max requests. + # Remove or increase it for crawling all links. + max_requests_per_crawl=50, + ) + + # Define a request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + url = context.request.url + Actor.log.info(f'Scraping {url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + 'h1s': context.selector.xpath('//h1/text()').getall(), + 'h2s': context.selector.xpath('//h2/text()').getall(), + 'h3s': context.selector.xpath('//h3/text()').getall(), + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links() + + # Run the crawler with the starting requests. + await crawler.run(start_urls) From f3e3473c8a376c017df3721ecab585521e965360 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Sep 2025 14:55:02 +0200 Subject: [PATCH 3/6] add parsel impit --- docs/03_guides/01_beautifulsoup_httpx.mdx | 10 +-- docs/03_guides/02_parsel_impit.mdx | 28 ++++++ .../{02_crawlee.mdx => 05_crawlee.mdx} | 6 +- .../{05_scrapy.mdx => 06_scrapy.mdx} | 0 docs/03_guides/code/01_beautifulsoup_httpx.py | 6 +- docs/03_guides/code/02_parsel_impit.py | 89 +++++++++++++++++++ docs/03_guides/code/03_playwright.py | 2 - docs/03_guides/code/04_selenium.py | 2 - ...fulsoup.py => 05_crawlee_beautifulsoup.py} | 2 - ...crawlee_parsel.py => 05_crawlee_parsel.py} | 0 ...playwright.py => 05_crawlee_playwright.py} | 2 - 11 files changed, 127 insertions(+), 20 deletions(-) create mode 100644 docs/03_guides/02_parsel_impit.mdx rename docs/03_guides/{02_crawlee.mdx => 05_crawlee.mdx} (95%) rename docs/03_guides/{05_scrapy.mdx => 06_scrapy.mdx} (100%) create mode 100644 docs/03_guides/code/02_parsel_impit.py rename docs/03_guides/code/{02_crawlee_beautifulsoup.py => 05_crawlee_beautifulsoup.py} (98%) rename docs/03_guides/code/{02_crawlee_parsel.py => 05_crawlee_parsel.py} (100%) rename docs/03_guides/code/{02_crawlee_playwright.py => 05_crawlee_playwright.py} (98%) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index 4ecabd6e..b6a69c01 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -11,15 +11,15 @@ In this guide, you'll learn how to use the [BeautifulSoup](https://www.crummy.co ## Introduction -`BeautifulSoup` is a Python library for extracting data from HTML and XML files. 
It provides simple methods and Pythonic idioms for navigating, searching, and modifying a website's element tree, enabling efficient data extraction. +[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) is a Python library for extracting data from HTML and XML files. It provides simple methods and Pythonic idioms for navigating, searching, and modifying a website's element tree, enabling efficient data extraction. -`HTTPX` is a modern, high-level HTTP client library for Python. It provides a simple interface for making HTTP requests and supports both synchronous and asynchronous requests. +[HTTPX](https://www.python-httpx.org/) is a modern, high-level HTTP client library for Python. It provides a simple interface for making HTTP requests and supports both synchronous and asynchronous requests. -To create an `Actor` which uses those libraries, start from the [BeautifulSoup & Python](https://apify.com/templates/categories/python) Actor template. This template includes the `BeautifulSoup` and `HTTPX` libraries preinstalled, allowing you to begin development immediately. +To create an Actor which uses those libraries, start from the [BeautifulSoup & Python](https://apify.com/templates/categories/python) Actor template. This template includes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) and [HTTPX](https://www.python-httpx.org/) libraries preinstalled, allowing you to begin development immediately. ## Example Actor -Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses `HTTPX` for fetching pages and `BeautifulSoup` for parsing their content to extract titles and links to other pages. +Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses [HTTPX](https://www.python-httpx.org/) for fetching pages and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract titles and links to other pages. {BeautifulSoupHttpxExample} @@ -27,4 +27,4 @@ Below is a simple Actor that recursively scrapes titles from all linked websites ## Conclusion -In this guide, you learned how to use the `BeautifulSoup` with the `HTTPX` in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +In this guide, you learned how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) with the [HTTPX](https://www.python-httpx.org/) in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
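The BeautifulSoup and HTTPX combination described in the guide above reduces to a minimal sketch: fetch one page with HTTPX, parse it with BeautifulSoup, and read the title. This is a standalone illustration rather than part of the patch; the URL and the fetch_title helper name are just examples.

import asyncio

import httpx
from bs4 import BeautifulSoup


async def fetch_title(url: str) -> str | None:
    # Fetch the page with HTTPX.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        response = await client.get(url)
    # Parse the HTML and extract the <title> text.
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.title.string if soup.title else None


if __name__ == '__main__':
    print(asyncio.run(fetch_title('https://apify.com')))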
diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx new file mode 100644 index 00000000..2ac4d610 --- /dev/null +++ b/docs/03_guides/02_parsel_impit.mdx @@ -0,0 +1,28 @@ +--- +id: parsel-impit +title: Using Parsel with Impit +--- + +import CodeBlock from '@theme/CodeBlock'; + +import ParselImpitExample from '!!raw-loader!./code/02_parsel_impit.py'; + +In this guide, you'll learn how to combine the [Parsel](https://github.com/scrapy/parsel) and [Impit](https://github.com/apify/impit) libraries when building Apify Actors. + +## Introduction + +[Parsel](https://github.com/scrapy/parsel) is a Python library for extracting data from HTML and XML documents using CSS selectors and [XPath](https://en.wikipedia.org/wiki/XPath) expressions. It offers an intuitive API for navigating and extracting structured data, making it a popular choice for web scraping. Compared to [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), it also delivers better performance. + +[Impit](https://github.com/apify/impit) is Apify's high-performance HTTP client for Python. It supports both synchronous and asynchronous workflows and is built for large-scale web scraping, where making thousands of requests efficiently is essential. With built-in browser impersonation and anti-blocking features, it simplifies handling modern websites. + +## Example Actor + +The following example shows a simple Actor that recursively scrapes titles from linked pages, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages and [Parsel](https://github.com/scrapy/parsel) to extract titles and discover new links. + + + {ParselImpitExample} + + +## Conclusion + +In this guide, you learned how to use [Parsel](https://github.com/scrapy/parsel) with [Impit](https://github.com/apify/impit) in your Apify Actors. By combining these libraries, you get a powerful and efficient solution for web scraping: [Parsel](https://github.com/scrapy/parsel) provides excellent CSS selector and XPath support for data extraction, while [Impit](https://github.com/apify/impit) offers a fast and simple HTTP client built by Apify. This combination makes it easy to build scalable web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
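The Parsel plus Impit workflow introduced by the new guide can be illustrated with a similar minimal sketch: Impit fetches the page and Parsel extracts the title. The browser argument is an assumption based on the impersonation feature the guide mentions; the plain impit.AsyncClient() used in the full example later in this patch works as well.

import asyncio

import impit
import parsel


async def fetch_title(url: str) -> str | None:
    # browser='firefox' is assumed here to demonstrate impersonation;
    # drop it if your Impit version does not accept it.
    async with impit.AsyncClient(browser='firefox') as client:
        response = await client.get(url)
    selector = parsel.Selector(text=response.text)
    return selector.css('title::text').get()


if __name__ == '__main__':
    print(asyncio.run(fetch_title('https://apify.com')))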
diff --git a/docs/03_guides/02_crawlee.mdx b/docs/03_guides/05_crawlee.mdx similarity index 95% rename from docs/03_guides/02_crawlee.mdx rename to docs/03_guides/05_crawlee.mdx index 0b0c0e2c..7dc5b6e1 100644 --- a/docs/03_guides/02_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -5,9 +5,9 @@ title: Using Crawlee import CodeBlock from '@theme/CodeBlock'; -import CrawleeBeautifulSoupExample from '!!raw-loader!./code/02_crawlee_beautifulsoup.py'; -import CrawleeParselExample from '!!raw-loader!./code/02_crawlee_parsel.py'; -import CrawleePlaywrightExample from '!!raw-loader!./code/02_crawlee_playwright.py'; +import CrawleeBeautifulSoupExample from '!!raw-loader!./code/05_crawlee_beautifulsoup.py'; +import CrawleeParselExample from '!!raw-loader!./code/05_crawlee_parsel.py'; +import CrawleePlaywrightExample from '!!raw-loader!./code/05_crawlee_playwright.py'; In this guide you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. diff --git a/docs/03_guides/05_scrapy.mdx b/docs/03_guides/06_scrapy.mdx similarity index 100% rename from docs/03_guides/05_scrapy.mdx rename to docs/03_guides/06_scrapy.mdx diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 36d3bca7..14a9cd78 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,9 +1,7 @@ -from __future__ import annotations - from urllib.parse import urljoin +import httpx from bs4 import BeautifulSoup -from httpx import AsyncClient from apify import Actor, Request @@ -32,7 +30,7 @@ async def main() -> None: await request_queue.add_request(new_request) # Create an HTTPX client to fetch the HTML content of the URLs. - async with AsyncClient() as client: + async with httpx.AsyncClient() as client: # Process the URLs from the request queue. while request := await request_queue.fetch_next_request(): url = request.url diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py new file mode 100644 index 00000000..a4175d61 --- /dev/null +++ b/docs/03_guides/code/02_parsel_impit.py @@ -0,0 +1,89 @@ +from urllib.parse import urljoin + +import impit +import parsel + +from apify import Actor, Request + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Create an Impit client to fetch the HTML content of the URLs. + async with impit.AsyncClient() as client: + # Process the URLs from the request queue. 
+ while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fetch the HTTP response from the specified URL using Impit. + response = await client.get(url) + + # Parse the HTML content using Parsel Selector. + selector = parsel.Selector(text=response.text) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. + if depth < max_depth: + # Extract all links using CSS selector + links = selector.css('a::attr(href)').getall() + for link_href in links: + link_url = urljoin(url, link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data using Parsel selectors. + title = selector.css('title::text').get() + h1s = selector.css('h1::text').getall() + h2s = selector.css('h2::text').getall() + h3s = selector.css('h3::text').getall() + + data = { + 'url': url, + 'title': title, + 'h1s': h1s, + 'h2s': h2s, + 'h3s': h3s, + } + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 78ebdda3..be68cc49 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from urllib.parse import urljoin from playwright.async_api import async_playwright diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 75c55b2f..ffd1cc30 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import asyncio from urllib.parse import urljoin diff --git a/docs/03_guides/code/02_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py similarity index 98% rename from docs/03_guides/code/02_crawlee_beautifulsoup.py rename to docs/03_guides/code/05_crawlee_beautifulsoup.py index e2dba8a1..676ebab1 100644 --- a/docs/03_guides/code/02_crawlee_beautifulsoup.py +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from apify import Actor diff --git a/docs/03_guides/code/02_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py similarity index 100% rename from docs/03_guides/code/02_crawlee_parsel.py rename to docs/03_guides/code/05_crawlee_parsel.py diff --git a/docs/03_guides/code/02_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py similarity index 98% rename from docs/03_guides/code/02_crawlee_playwright.py rename to docs/03_guides/code/05_crawlee_playwright.py index 2f0f110f..745fd92f 100644 --- a/docs/03_guides/code/02_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from apify 
import Actor From d8e3b5168590c5a6df862e6e5887c4900fabde78 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Sep 2025 15:47:34 +0200 Subject: [PATCH 4/6] fix scrapy integration test path --- tests/integration/test_actor_scrapy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_actor_scrapy.py b/tests/integration/test_actor_scrapy.py index 9365521e..410ea904 100644 --- a/tests/integration/test_actor_scrapy.py +++ b/tests/integration/test_actor_scrapy.py @@ -11,7 +11,7 @@ async def test_actor_scrapy_title_spider( make_actor: MakeActorFunction, run_actor: RunActorFunction, ) -> None: - base_path = Path('docs/02_guides/code/scrapy_project') + base_path = Path('docs/03_guides/code/scrapy_project') actor_source_files = { 'src/__init__.py': (base_path / 'src/__init__.py').read_text(), From 43425635ef1dcb139405b1833410dcd4dd1518b3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Sep 2025 15:49:28 +0200 Subject: [PATCH 5/6] rm unnecessary future import --- docs/03_guides/code/05_crawlee_parsel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py index 822754b9..71ac6c81 100644 --- a/docs/03_guides/code/05_crawlee_parsel.py +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from apify import Actor From 704b76d8cd1f99bbb8b0856791e8d5ed0481b3dd Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Sep 2025 16:25:01 +0200 Subject: [PATCH 6/6] address the feedback --- docs/03_guides/05_crawlee.mdx | 2 +- docs/03_guides/code/01_beautifulsoup_httpx.py | 5 ++ docs/03_guides/code/02_parsel_impit.py | 5 ++ docs/03_guides/code/03_playwright.py | 5 ++ docs/03_guides/code/04_selenium.py | 4 + .../code/05_crawlee_beautifulsoup.py | 66 +++++++-------- docs/03_guides/code/05_crawlee_parsel.py | 66 +++++++-------- docs/03_guides/code/05_crawlee_playwright.py | 82 +++++++++---------- 8 files changed, 125 insertions(+), 110 deletions(-) diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index 7dc5b6e1..6b513417 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -35,7 +35,7 @@ The [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) works ## Actor with PlaywrightCrawler -The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor. +The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content rendering. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor. 
{CrawleePlaywrightExample} diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 14a9cd78..157948d0 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,3 +1,4 @@ +import asyncio from urllib.parse import urljoin import httpx @@ -81,3 +82,7 @@ async def main() -> None: finally: # Mark the request as handled to ensure it is not processed again. await request_queue.mark_request_as_handled(new_request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index a4175d61..21b5e74f 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -1,3 +1,4 @@ +import asyncio from urllib.parse import urljoin import impit @@ -87,3 +88,7 @@ async def main() -> None: finally: # Mark the request as handled to ensure it is not processed again. await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index be68cc49..14868ad8 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,3 +1,4 @@ +import asyncio from urllib.parse import urljoin from playwright.async_api import async_playwright @@ -90,3 +91,7 @@ async def main() -> None: await page.close() # Mark the request as handled to ensure it is not processed again. await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index ffd1cc30..8cffe606 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -100,3 +100,7 @@ async def main() -> None: await request_queue.mark_request_as_handled(request) driver.quit() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/05_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py index 676ebab1..4d3a81d7 100644 --- a/docs/03_guides/code/05_crawlee_beautifulsoup.py +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -1,7 +1,36 @@ +import asyncio + from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from apify import Actor +# Create a crawler. +crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, +) + + +# Define a request handler, which will be called for every request. +@crawler.router.default_handler +async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + Actor.log.info(f'Scraping {context.request.url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links(strategy='same-domain') + async def main() -> None: # Enter the context of the Actor. 
@@ -10,10 +39,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get( - 'start_urls', - [{'url': 'https://apify.com'}], - ) + for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) ] # Exit if no start URLs are provided. @@ -21,33 +47,9 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a crawler. - crawler = BeautifulSoupCrawler( - # Limit the crawl to max requests. - # Remove or increase it for crawling all links. - max_requests_per_crawl=50, - ) - - # Define a request handler, which will be called for every request. - @crawler.router.default_handler - async def request_handler(context: BeautifulSoupCrawlingContext) -> None: - url = context.request.url - Actor.log.info(f'Scraping {url}...') - - # Extract the desired data. - data = { - 'url': context.request.url, - 'title': context.soup.title.string if context.soup.title else None, - 'h1s': [h1.text for h1 in context.soup.find_all('h1')], - 'h2s': [h2.text for h2 in context.soup.find_all('h2')], - 'h3s': [h3.text for h3 in context.soup.find_all('h3')], - } - - # Store the extracted data to the default dataset. - await context.push_data(data) - - # Enqueue additional links found on the current page. - await context.enqueue_links() - # Run the crawler with the starting requests. await crawler.run(start_urls) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py index 71ac6c81..31f39d8b 100644 --- a/docs/03_guides/code/05_crawlee_parsel.py +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -1,7 +1,36 @@ +import asyncio + from crawlee.crawlers import ParselCrawler, ParselCrawlingContext from apify import Actor +# Create a crawler. +crawler = ParselCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, +) + + +# Define a request handler, which will be called for every request. +@crawler.router.default_handler +async def request_handler(context: ParselCrawlingContext) -> None: + Actor.log.info(f'Scraping {context.request.url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': context.selector.xpath('//title/text()').get(), + 'h1s': context.selector.xpath('//h1/text()').getall(), + 'h2s': context.selector.xpath('//h2/text()').getall(), + 'h3s': context.selector.xpath('//h3/text()').getall(), + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links(strategy='same-domain') + async def main() -> None: # Enter the context of the Actor. @@ -10,10 +39,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get( - 'start_urls', - [{'url': 'https://apify.com'}], - ) + for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) ] # Exit if no start URLs are provided. @@ -21,33 +47,9 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a crawler. - crawler = ParselCrawler( - # Limit the crawl to max requests. - # Remove or increase it for crawling all links. - max_requests_per_crawl=50, - ) - - # Define a request handler, which will be called for every request. 
- @crawler.router.default_handler - async def request_handler(context: ParselCrawlingContext) -> None: - url = context.request.url - Actor.log.info(f'Scraping {url}...') - - # Extract the desired data. - data = { - 'url': context.request.url, - 'title': context.selector.xpath('//title/text()').get(), - 'h1s': context.selector.xpath('//h1/text()').getall(), - 'h2s': context.selector.xpath('//h2/text()').getall(), - 'h3s': context.selector.xpath('//h3/text()').getall(), - } - - # Store the extracted data to the default dataset. - await context.push_data(data) - - # Enqueue additional links found on the current page. - await context.enqueue_links() - # Run the crawler with the starting requests. await crawler.run(start_urls) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py index 745fd92f..be4ea29e 100644 --- a/docs/03_guides/code/05_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -1,7 +1,39 @@ +import asyncio + from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext from apify import Actor +# Create a crawler. +crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, + # Run the browser in a headless mode. + headless=True, + browser_launch_options={'args': ['--disable-gpu']}, +) + + +# Define a request handler, which will be called for every request. +@crawler.router.default_handler +async def request_handler(context: PlaywrightCrawlingContext) -> None: + Actor.log.info(f'Scraping {context.request.url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()], + 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()], + 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()], + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links(strategy='same-domain') + async def main() -> None: # Enter the context of the Actor. @@ -10,10 +42,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get( - 'start_urls', - [{'url': 'https://apify.com'}], - ) + for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) ] # Exit if no start URLs are provided. @@ -21,46 +50,9 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a crawler. - crawler = PlaywrightCrawler( - # Limit the crawl to max requests. - # Remove or increase it for crawling all links. - max_requests_per_crawl=50, - headless=True, - browser_launch_options={ - 'args': ['--disable-gpu'], - }, - ) - - # Define a request handler, which will be called for every request. - @crawler.router.default_handler - async def request_handler(context: PlaywrightCrawlingContext) -> None: - url = context.request.url - Actor.log.info(f'Scraping {url}...') - - # Extract the desired data. 
- data = { - 'url': context.request.url, - 'title': await context.page.title(), - 'h1s': [ - await h1.text_content() - for h1 in await context.page.locator('h1').all() - ], - 'h2s': [ - await h2.text_content() - for h2 in await context.page.locator('h2').all() - ], - 'h3s': [ - await h3.text_content() - for h3 in await context.page.locator('h3').all() - ], - } - - # Store the extracted data to the default dataset. - await context.push_data(data) - - # Enqueue additional links found on the current page. - await context.enqueue_links() - # Run the crawler with the starting requests. await crawler.run(start_urls) + + +if __name__ == '__main__': + asyncio.run(main())
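Read together, the hunks in this last patch converge on one layout for the Crawlee examples: the crawler and its request handler move to module level, main() only reads the Actor input and runs the crawler inside the Actor context, and a script entry point is added. Below is a condensed sketch of that layout, assembled from the hunks above, with the extracted data trimmed to the URL for brevity.

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

# Crawler and handler are defined at import time, outside main().
crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)


@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')
    # Store a minimal record and keep crawling within the same domain.
    await context.push_data({'url': context.request.url})
    await context.enqueue_links(strategy='same-domain')


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        start_urls = [
            url.get('url')
            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]
        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())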