From 9c3581a07867f9e0f1a483065b390f2eeeee23df Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 22 Jan 2025 19:42:42 +0100 Subject: [PATCH 1/3] docs: Move code samples to files and other updates Closes: #250 --- CHANGELOG.md | 10 +- Makefile | 16 +- README.md | 8 +- docs/01-overview/01-introduction.mdx | 69 ----- docs/01-overview/03-structure.mdx | 52 ---- docs/01_overview/01_introduction.mdx | 59 ++++ .../02_running_actors_locally.mdx} | 23 +- docs/01_overview/03_actor_structure.mdx | 35 +++ docs/01_overview/code/01_introduction.py | 14 + .../code/actor_structure/__init__.py | 0 .../code/actor_structure/__main__.py | 6 + docs/01_overview/code/actor_structure/main.py | 8 + .../01_overview/code/actor_structure/py.typed | 0 docs/02-guides/01-requests-and-httpx.mdx | 100 ------ docs/02-guides/02-beautiful-soup.mdx | 81 ----- docs/02-guides/03-playwright.mdx | 120 -------- docs/02-guides/04-selenium.mdx | 110 ------- docs/02-guides/05-scrapy.mdx | 111 ------- docs/02_guides/01_beautifulsoup_httpx.mdx | 30 ++ docs/02_guides/02_crawlee.mdx | 37 +++ docs/02_guides/03_playwright.mdx | 56 ++++ docs/02_guides/04_selenium.mdx | 46 +++ docs/02_guides/05_scrapy.mdx | 96 ++++++ docs/02_guides/code/01_beautifulsoup_httpx.py | 85 ++++++ .../code/02_crawlee_beautifulsoup.py | 54 ++++ docs/02_guides/code/02_crawlee_playwright.py | 58 ++++ docs/02_guides/code/03_playwright.py | 94 ++++++ docs/02_guides/code/04_selenium.py | 104 +++++++ docs/02_guides/code/scrapy_src/__init__.py | 0 docs/02_guides/code/scrapy_src/__main__.py | 121 ++++++++ docs/02_guides/code/scrapy_src/items.py | 17 ++ docs/02_guides/code/scrapy_src/main.py | 60 ++++ docs/02_guides/code/scrapy_src/py.typed | 0 docs/02_guides/code/scrapy_src/settings.py | 15 + .../code/scrapy_src/spiders/__init__.py | 0 .../code/scrapy_src/spiders/py.typed | 0 .../code/scrapy_src/spiders/title.py | 53 ++++ docs/03-concepts/01-actor-lifecycle.mdx | 100 ------ docs/03-concepts/02-actor-input.mdx | 27 -- docs/03-concepts/03-storages.mdx | 284 ------------------ docs/03-concepts/05-proxy-management.mdx | 194 ------------ .../06-interacting-with-other-actors.mdx | 96 ------ docs/03-concepts/07-webhooks.mdx | 50 --- docs/03-concepts/08-access-apify-api.mdx | 42 --- docs/03-concepts/09-running-webserver.mdx | 66 ---- docs/03-concepts/11-configuration.mdx | 45 --- docs/03_concepts/01_actor_lifecycle.mdx | 55 ++++ docs/03_concepts/02_actor_input.mdx | 18 ++ docs/03_concepts/03_storages.mdx | 167 ++++++++++ .../04_actor_events.mdx} | 67 ++--- docs/03_concepts/05_proxy_management.mdx | 108 +++++++ .../06_interacting_with_other_actors.mdx | 51 ++++ docs/03_concepts/07_webhooks.mdx | 31 ++ docs/03_concepts/08_access_apify_api.mdx | 31 ++ docs/03_concepts/09_running_webserver.mdx | 26 ++ .../10_logging.mdx} | 61 ++-- docs/03_concepts/11_configuration.mdx | 32 ++ docs/03_concepts/code/01_context_manager.py | 9 + docs/03_concepts/code/01_init_exit.py | 16 + docs/03_concepts/code/01_reboot.py | 7 + docs/03_concepts/code/01_status_message.py | 14 + docs/03_concepts/code/02_input.py | 9 + docs/03_concepts/code/03_dataset_exports.py | 31 ++ .../03_concepts/code/03_dataset_read_write.py | 16 + docs/03_concepts/code/03_deleting_storages.py | 13 + docs/03_concepts/code/03_kvs_iterating.py | 18 ++ docs/03_concepts/code/03_kvs_public_url.py | 11 + docs/03_concepts/code/03_kvs_read_write.py | 25 ++ docs/03_concepts/code/03_opening_storages.py | 16 + docs/03_concepts/code/03_rq.py | 50 +++ docs/03_concepts/code/04_actor_events.py | 38 +++ docs/03_concepts/code/05_apify_proxy.py | 
12 + .../03_concepts/code/05_apify_proxy_config.py | 15 + docs/03_concepts/code/05_custom_proxy.py | 17 ++ .../code/05_custom_proxy_function.py | 28 ++ docs/03_concepts/code/05_proxy_actor_input.py | 14 + docs/03_concepts/code/05_proxy_httpx.py | 22 ++ docs/03_concepts/code/05_proxy_rotation.py | 23 ++ docs/03_concepts/code/06_interacting_call.py | 22 ++ .../code/06_interacting_call_task.py | 19 ++ .../code/06_interacting_metamorph.py | 24 ++ docs/03_concepts/code/06_interacting_start.py | 13 + docs/03_concepts/code/07_webhook.py | 16 + .../03_concepts/code/07_webhook_preventing.py | 17 ++ docs/03_concepts/code/08_actor_client.py | 11 + docs/03_concepts/code/08_actor_new_client.py | 14 + docs/03_concepts/code/09_webserver.py | 47 +++ docs/03_concepts/code/10_log_config.py | 12 + docs/03_concepts/code/10_logger_usage.py | 23 ++ docs/03_concepts/code/11_config.py | 16 + .../upgrading_to_v2.md | 0 pyproject.toml | 17 +- tests/unit/conftest.py | 4 +- website/docusaurus.config.js | 4 + website/generate_module_shortcuts.py | 21 +- website/sidebars.js | 10 +- website/src/pages/home_page_example.py | 14 + website/src/pages/index.js | 21 +- 98 files changed, 2320 insertions(+), 1708 deletions(-) delete mode 100644 docs/01-overview/01-introduction.mdx delete mode 100644 docs/01-overview/03-structure.mdx create mode 100644 docs/01_overview/01_introduction.mdx rename docs/{01-overview/02-running-locally.mdx => 01_overview/02_running_actors_locally.mdx} (55%) create mode 100644 docs/01_overview/03_actor_structure.mdx create mode 100644 docs/01_overview/code/01_introduction.py create mode 100644 docs/01_overview/code/actor_structure/__init__.py create mode 100644 docs/01_overview/code/actor_structure/__main__.py create mode 100644 docs/01_overview/code/actor_structure/main.py create mode 100644 docs/01_overview/code/actor_structure/py.typed delete mode 100644 docs/02-guides/01-requests-and-httpx.mdx delete mode 100644 docs/02-guides/02-beautiful-soup.mdx delete mode 100644 docs/02-guides/03-playwright.mdx delete mode 100644 docs/02-guides/04-selenium.mdx delete mode 100644 docs/02-guides/05-scrapy.mdx create mode 100644 docs/02_guides/01_beautifulsoup_httpx.mdx create mode 100644 docs/02_guides/02_crawlee.mdx create mode 100644 docs/02_guides/03_playwright.mdx create mode 100644 docs/02_guides/04_selenium.mdx create mode 100644 docs/02_guides/05_scrapy.mdx create mode 100644 docs/02_guides/code/01_beautifulsoup_httpx.py create mode 100644 docs/02_guides/code/02_crawlee_beautifulsoup.py create mode 100644 docs/02_guides/code/02_crawlee_playwright.py create mode 100644 docs/02_guides/code/03_playwright.py create mode 100644 docs/02_guides/code/04_selenium.py create mode 100644 docs/02_guides/code/scrapy_src/__init__.py create mode 100644 docs/02_guides/code/scrapy_src/__main__.py create mode 100644 docs/02_guides/code/scrapy_src/items.py create mode 100644 docs/02_guides/code/scrapy_src/main.py create mode 100644 docs/02_guides/code/scrapy_src/py.typed create mode 100644 docs/02_guides/code/scrapy_src/settings.py create mode 100644 docs/02_guides/code/scrapy_src/spiders/__init__.py create mode 100644 docs/02_guides/code/scrapy_src/spiders/py.typed create mode 100644 docs/02_guides/code/scrapy_src/spiders/title.py delete mode 100644 docs/03-concepts/01-actor-lifecycle.mdx delete mode 100644 docs/03-concepts/02-actor-input.mdx delete mode 100644 docs/03-concepts/03-storages.mdx delete mode 100644 docs/03-concepts/05-proxy-management.mdx delete mode 100644 
docs/03-concepts/06-interacting-with-other-actors.mdx delete mode 100644 docs/03-concepts/07-webhooks.mdx delete mode 100644 docs/03-concepts/08-access-apify-api.mdx delete mode 100644 docs/03-concepts/09-running-webserver.mdx delete mode 100644 docs/03-concepts/11-configuration.mdx create mode 100644 docs/03_concepts/01_actor_lifecycle.mdx create mode 100644 docs/03_concepts/02_actor_input.mdx create mode 100644 docs/03_concepts/03_storages.mdx rename docs/{03-concepts/04-actor-events.mdx => 03_concepts/04_actor_events.mdx} (57%) create mode 100644 docs/03_concepts/05_proxy_management.mdx create mode 100644 docs/03_concepts/06_interacting_with_other_actors.mdx create mode 100644 docs/03_concepts/07_webhooks.mdx create mode 100644 docs/03_concepts/08_access_apify_api.mdx create mode 100644 docs/03_concepts/09_running_webserver.mdx rename docs/{03-concepts/10-logging.mdx => 03_concepts/10_logging.mdx} (52%) create mode 100644 docs/03_concepts/11_configuration.mdx create mode 100644 docs/03_concepts/code/01_context_manager.py create mode 100644 docs/03_concepts/code/01_init_exit.py create mode 100644 docs/03_concepts/code/01_reboot.py create mode 100644 docs/03_concepts/code/01_status_message.py create mode 100644 docs/03_concepts/code/02_input.py create mode 100644 docs/03_concepts/code/03_dataset_exports.py create mode 100644 docs/03_concepts/code/03_dataset_read_write.py create mode 100644 docs/03_concepts/code/03_deleting_storages.py create mode 100644 docs/03_concepts/code/03_kvs_iterating.py create mode 100644 docs/03_concepts/code/03_kvs_public_url.py create mode 100644 docs/03_concepts/code/03_kvs_read_write.py create mode 100644 docs/03_concepts/code/03_opening_storages.py create mode 100644 docs/03_concepts/code/03_rq.py create mode 100644 docs/03_concepts/code/04_actor_events.py create mode 100644 docs/03_concepts/code/05_apify_proxy.py create mode 100644 docs/03_concepts/code/05_apify_proxy_config.py create mode 100644 docs/03_concepts/code/05_custom_proxy.py create mode 100644 docs/03_concepts/code/05_custom_proxy_function.py create mode 100644 docs/03_concepts/code/05_proxy_actor_input.py create mode 100644 docs/03_concepts/code/05_proxy_httpx.py create mode 100644 docs/03_concepts/code/05_proxy_rotation.py create mode 100644 docs/03_concepts/code/06_interacting_call.py create mode 100644 docs/03_concepts/code/06_interacting_call_task.py create mode 100644 docs/03_concepts/code/06_interacting_metamorph.py create mode 100644 docs/03_concepts/code/06_interacting_start.py create mode 100644 docs/03_concepts/code/07_webhook.py create mode 100644 docs/03_concepts/code/07_webhook_preventing.py create mode 100644 docs/03_concepts/code/08_actor_client.py create mode 100644 docs/03_concepts/code/08_actor_new_client.py create mode 100644 docs/03_concepts/code/09_webserver.py create mode 100644 docs/03_concepts/code/10_log_config.py create mode 100644 docs/03_concepts/code/10_logger_usage.py create mode 100644 docs/03_concepts/code/11_config.py rename docs/{04-upgrading => 04_upgrading}/upgrading_to_v2.md (100%) create mode 100644 website/src/pages/home_page_example.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b3b0d05..cd440e79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,7 +81,7 @@ All notable changes to this project will be documented in this file. 
### 🚀 Features -- Add actor standby port ([#220](https://github.com/apify/apify-sdk-python/pull/220)) ([6d0d87d](https://github.com/apify/apify-sdk-python/commit/6d0d87dcaedaf42d8eeb7d23c56f6b102434cbcb)) by [@jirimoravcik](https://github.com/jirimoravcik) +- Add Actor standby port ([#220](https://github.com/apify/apify-sdk-python/pull/220)) ([6d0d87d](https://github.com/apify/apify-sdk-python/commit/6d0d87dcaedaf42d8eeb7d23c56f6b102434cbcb)) by [@jirimoravcik](https://github.com/jirimoravcik) ## [1.7.1](https://github.com/apify/apify-sdk-python/releases/tag/v1.7.1) (2024-05-23) @@ -122,12 +122,12 @@ All notable changes to this project will be documented in this file. - Add test for get_env and is_at_home ([#29](https://github.com/apify/apify-sdk-python/pull/29)) ([cc45afb](https://github.com/apify/apify-sdk-python/commit/cc45afbf848db3626054c599cb3a5a2972a48748)) by [@drobnikj](https://github.com/drobnikj) - Updating pull request toolkit config [INTERNAL] ([387143c](https://github.com/apify/apify-sdk-python/commit/387143ccf2c32a99c95e9931e5649e558d35daeb)) by [@mtrunkat](https://github.com/mtrunkat) - Add documentation for `StorageManager` and `StorageClientManager`, open_* methods in `Actor` ([#34](https://github.com/apify/apify-sdk-python/pull/34)) ([3f6b942](https://github.com/apify/apify-sdk-python/commit/3f6b9426dc03fea40d80af2e4c8f04ecf2620e8a)) by [@jirimoravcik](https://github.com/jirimoravcik) -- Add tests for actor lifecycle ([#35](https://github.com/apify/apify-sdk-python/pull/35)) ([4674728](https://github.com/apify/apify-sdk-python/commit/4674728905be5076283ff3795332866e8bef6ee8)) by [@drobnikj](https://github.com/drobnikj) +- Add tests for Actor lifecycle ([#35](https://github.com/apify/apify-sdk-python/pull/35)) ([4674728](https://github.com/apify/apify-sdk-python/commit/4674728905be5076283ff3795332866e8bef6ee8)) by [@drobnikj](https://github.com/drobnikj) - Add docs for `Dataset`, `KeyValueStore`, and `RequestQueue` ([#37](https://github.com/apify/apify-sdk-python/pull/37)) ([174548e](https://github.com/apify/apify-sdk-python/commit/174548e952b47ee519d1a05c0821a2c42c2fddf6)) by [@jirimoravcik](https://github.com/jirimoravcik) - Docs string for memory storage clients ([#31](https://github.com/apify/apify-sdk-python/pull/31)) ([8f55d46](https://github.com/apify/apify-sdk-python/commit/8f55d463394307b004193efc43b67b44d030f6de)) by [@drobnikj](https://github.com/drobnikj) -- Add test for storage actor methods ([#39](https://github.com/apify/apify-sdk-python/pull/39)) ([b89bbcf](https://github.com/apify/apify-sdk-python/commit/b89bbcfdcae4f436a68e92f1f60628aea1036dde)) by [@drobnikj](https://github.com/drobnikj) +- Add test for storage Actor methods ([#39](https://github.com/apify/apify-sdk-python/pull/39)) ([b89bbcf](https://github.com/apify/apify-sdk-python/commit/b89bbcfdcae4f436a68e92f1f60628aea1036dde)) by [@drobnikj](https://github.com/drobnikj) - Various fixes and improvements ([#41](https://github.com/apify/apify-sdk-python/pull/41)) ([5bae238](https://github.com/apify/apify-sdk-python/commit/5bae238821b3b63c73d0cbadf4b478511cb045d2)) by [@jirimoravcik](https://github.com/jirimoravcik) -- Add the rest unit tests for actor ([#40](https://github.com/apify/apify-sdk-python/pull/40)) ([72d92ea](https://github.com/apify/apify-sdk-python/commit/72d92ea080670ceecc234c149058d2ebe763e3a8)) by [@drobnikj](https://github.com/drobnikj) +- Add the rest unit tests for Actor ([#40](https://github.com/apify/apify-sdk-python/pull/40)) 
([72d92ea](https://github.com/apify/apify-sdk-python/commit/72d92ea080670ceecc234c149058d2ebe763e3a8)) by [@drobnikj](https://github.com/drobnikj) - Decrypt input secrets if there are some ([#45](https://github.com/apify/apify-sdk-python/pull/45)) ([6eb1630](https://github.com/apify/apify-sdk-python/commit/6eb163077341218a3f9dcf566986d7464f6ab09e)) by [@drobnikj](https://github.com/drobnikj) - Add a few integration tests ([#48](https://github.com/apify/apify-sdk-python/pull/48)) ([1843f48](https://github.com/apify/apify-sdk-python/commit/1843f48845e724e1c2682b8d09a6b5c48c57d9ec)) by [@drobnikj](https://github.com/drobnikj) - Add integration tests for storages, proxy configuration ([#49](https://github.com/apify/apify-sdk-python/pull/49)) ([fd0566e](https://github.com/apify/apify-sdk-python/commit/fd0566ed3b8c85c7884f8bba3cf7394215fabed0)) by [@jirimoravcik](https://github.com/jirimoravcik) @@ -139,4 +139,4 @@ All notable changes to this project will be documented in this file. - Key error for storage name ([#28](https://github.com/apify/apify-sdk-python/pull/28)) ([83b30a9](https://github.com/apify/apify-sdk-python/commit/83b30a90df4d3b173302f1c6006b346091fced60)) by [@drobnikj](https://github.com/drobnikj) - \ No newline at end of file + diff --git a/Makefile b/Makefile index 78e0cc0e..d77cbc9b 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,6 @@ .PHONY: clean install-dev build publish-to-pypi lint type-check unit-tests unit-tests-cov \ integration-tests format check-code build-api-reference build-docs run-docs -DIRS_WITH_CODE = src tests - # This is default for local testing, but GitHub workflows override it to a higher value in CI INTEGRATION_TESTS_CONCURRENCY = 1 @@ -22,11 +20,11 @@ publish-to-pypi: poetry publish --no-interaction -vv lint: - poetry run ruff format --check $(DIRS_WITH_CODE) - poetry run ruff check $(DIRS_WITH_CODE) + poetry run ruff format --check + poetry run ruff check type-check: - poetry run mypy $(DIRS_WITH_CODE) + poetry run mypy unit-tests: poetry run pytest --numprocesses=auto --verbose --cov=src/apify tests/unit @@ -38,8 +36,8 @@ integration-tests: poetry run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) --verbose tests/integration format: - poetry run ruff check --fix $(DIRS_WITH_CODE) - poetry run ruff format $(DIRS_WITH_CODE) + poetry run ruff check --fix + poetry run ruff format # The check-code target runs a series of checks equivalent to those performed by pre-commit hooks # and the run_checks.yaml GitHub Actions workflow. @@ -49,7 +47,7 @@ build-api-reference: cd website && poetry run ./build_api_reference.sh build-docs: - cd website && npm clean-install && poetry run npm run build + cd website && npm clean-install && npm run build run-docs: build-api-reference - cd website && npm clean-install && poetry run npm run start + cd website && npm clean-install && npm run start diff --git a/README.md b/README.md index 6c2ec79f..5555f97f 100644 --- a/README.md +++ b/README.md @@ -36,10 +36,11 @@ Below are few examples demonstrating how to use the Apify SDK with some web scra This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages. 
```python -from apify import Actor from bs4 import BeautifulSoup from httpx import AsyncClient +from apify import Actor + async def main() -> None: async with Actor: @@ -84,8 +85,9 @@ async def main() -> None: This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping. ```python -from apify import Actor, Request -from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +from apify import Actor async def main() -> None: diff --git a/docs/01-overview/01-introduction.mdx b/docs/01-overview/01-introduction.mdx deleted file mode 100644 index 624c46f0..00000000 --- a/docs/01-overview/01-introduction.mdx +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: Introduction -sidebar_label: Introduction ---- - -The Apify SDK for Python is the official library for creating [Apify Actors](https://docs.apify.com/platform/actors) in Python. - -```python -from apify import Actor -from bs4 import BeautifulSoup -import requests - -async def main(): - async with Actor: - actor_input = await Actor.get_input() - response = requests.get(actor_input['url']) - soup = BeautifulSoup(response.content, 'html.parser') - await Actor.push_data({ 'url': actor_input['url'], 'title': soup.title.string }) -``` - -## What are Actors? - -Actors are serverless cloud programs that can do almost anything a human can do in a web browser. -They can do anything from small tasks such as filling in forms or unsubscribing from online services, -all the way up to scraping and processing vast numbers of web pages. - -Actors can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/), -where you can run them at scale, monitor them, schedule them, and even publish and monetize them. - -If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about) in the Apify platform documentation. - -## Quick start - -### Creating Actors - -To create and run Actors through Apify Console, -see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template). - -To create and run Python Actors locally, check the documentation for [how to create and run Python Actors locally](./running-locally). - -### Guides - -To see how you can integrate the Apify SDK with some of the most popular web scraping libraries, -see our guides for working with [Requests or HTTPX](../guides/requests-and-httpx), -[Beautiful Soup](../guides/beautiful-soup), -[Playwright](../guides/playwright), -[Selenium](../guides/selenium), -or [Scrapy](../guides/scrapy). - -### Usage concepts - -To learn more about the features of the Apify SDK and how to use them, -check out the Usage Concepts section in the sidebar, -especially the guides for the [Actor lifecycle](../concepts/actor-lifecycle), -[working with storages](../concepts/storages), -[handling Actor events](../concepts/actor-events), -and [how to use proxies](../concepts/proxy-management). - -## Installing the Apify SDK separately - -When you create an Actor using the Apify CLI, the Apify SDK for Python is installed for you automatically. 
-If you want to install it separately, you can install it from its [PyPI listing](https://github.com/apify/apify-sdk-python): - -```bash -pip install apify -``` - -If you are not developing Apify Actors and you just need to access the Apify API from Python, -consider using the [Apify API client for Python](https://docs.apify.com/api/client/python) directly. diff --git a/docs/01-overview/03-structure.mdx b/docs/01-overview/03-structure.mdx deleted file mode 100644 index d4e2053c..00000000 --- a/docs/01-overview/03-structure.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: Actor structure -sidebar_label: Actor structure ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import CodeBlock from '@theme/CodeBlock'; - -All Python Actor templates follow the same structure. - -The `.actor` directory contains the [Actor configuration](https://docs.apify.com/platform/actors/development/actor-config), -such as the Actor's definition and input schema, and the Dockerfile necessary to run the Actor on the Apify platform. - -The Actor's runtime dependencies are specified in the `requirements.txt` file, -which follows the [standard requirements file format](https://pip.pypa.io/en/stable/reference/requirements-file-format/). - -The Actor's source code is in the `src` folder. This folder contains two important files: -`main.py`, which contains the main function of the Actor, -and `__main__.py`, which is the entrypoint of the Actor package, -setting up the Actor [logger](../concepts/logging) -and executing the Actor's main function via [`asyncio.run()`](https://docs.python.org/3/library/asyncio-runner.html#asyncio.run). - - - - { -`from apify import Actor -${''} -async def main(): - async with Actor: - Actor.log.info('Actor input:', await Actor.get_input()) - await Actor.set_value('OUTPUT', 'Hello, world!')` - } - - - { -`import asyncio -import logging -${''} -from apify.log import ActorLogFormatter -${''} -from .main import main -${''} -asyncio.run(main())` - } - - - -If you want to modify the Actor structure, -you need to make sure that your Actor is executable as a module, via `python -m src`, -as that is the command started by `apify run` in the Apify CLI. -We recommend keeping the entrypoint for the Actor in the `src/__main__.py` file. diff --git a/docs/01_overview/01_introduction.mdx b/docs/01_overview/01_introduction.mdx new file mode 100644 index 00000000..f8ebcedd --- /dev/null +++ b/docs/01_overview/01_introduction.mdx @@ -0,0 +1,59 @@ +--- +title: Introduction +sidebar_label: Introduction +--- + +import CodeBlock from '@theme/CodeBlock'; + +import IntroductionExample from '!!raw-loader!./code/01_introduction.py'; + +The Apify SDK for Python is the official library for creating [Apify Actors](https://docs.apify.com/platform/actors) using Python. + + + {IntroductionExample} + + +## What are Actors? + +Actors are serverless cloud programs capable of performing tasks in a web browser, similar to what a human can do. These tasks can range from simple operations, such as filling out forms or unsubscribing from services, to complex jobs like scraping and processing large numbers of web pages. + +Actors can be executed locally or on the [Apify platform](https://docs.apify.com/platform/), which provides features for running them at scale, monitoring, scheduling, and even publishing and monetizing them. + +If you're new to Apify, refer to the Apify platform documentation to learn [what Apify is](https://docs.apify.com/platform/about). 
+ +## Quick Start + +This section provides a quick start guide for creating and running Actors. + +### Creating Actors + +To create and run Actors using the Apify Console, see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template). + +For creating and running Python Actors locally, refer to the documentation for [creating and running Python Actors locally](./running_locally). + +### Guides + +Integrate the Apify SDK with popular web scraping libraries by following these guides: +- [Requests or HTTPX](../guides/requests_and_httpx) +- [Beautiful Soup](../guides/beautiful_soup) +- [Playwright](../guides/playwright) +- [Selenium](../guides/selenium) +- [Scrapy](../guides/scrapy) + +### Usage Concepts + +For a deeper understanding of the Apify SDK's features, refer to the **Usage concepts** section in the sidebar. Key topics include: +- [Actor lifecycle](../concepts/actor-lifecycle) +- [Working with storages](../concepts/storages) +- [Handling Actor events](../concepts/actor-events) +- [Using proxies](../concepts/proxy-management) + +## Installing the Apify SDK Separately + +When creating an Actor using the Apify CLI, the Apify SDK for Python is installed automatically. If you want to install it independently, use the following command: + +```bash +pip install apify +``` + +If your goal is not to develop Apify Actors but to interact with the Apify API from Python, consider using the [Apify API client for Python](https://docs.apify.com/api/client/python) directly. diff --git a/docs/01-overview/02-running-locally.mdx b/docs/01_overview/02_running_actors_locally.mdx similarity index 55% rename from docs/01-overview/02-running-locally.mdx rename to docs/01_overview/02_running_actors_locally.mdx index 45e39c04..a8512a6b 100644 --- a/docs/01-overview/02-running-locally.mdx +++ b/docs/01_overview/02_running_actors_locally.mdx @@ -1,5 +1,5 @@ --- -title: Running Python Actors locally +title: Running Actor locally sidebar_label: Running Actors locally --- @@ -7,39 +7,34 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import CodeBlock from '@theme/CodeBlock'; +In this page, you'll learn how to create and run Apify Actors locally on your computer. + ## Requirements -The Apify SDK requires Python version 3.8 or above to run Python actors locally. +The Apify SDK requires Python version 3.9 or above to run Python Actors locally. ## Creating your first Actor -To create a new Apify Actor on your computer, you can use the [Apify CLI](https://docs.apify.com/cli), -and select one of the [Python Actor templates](https://apify.com/templates?category=python). +To create a new Apify Actor on your computer, you can use the [Apify CLI](https://docs.apify.com/cli), and select one of the [Python Actor templates](https://apify.com/templates/categories/python). -For example, to create an Actor from the "[beta] Python SDK" template, -you can use the [`apify create` command](https://docs.apify.com/cli/docs/reference#apify-create-actorname). +For example, to create an Actor from the Python SDK template, you can use the [`apify create`](https://docs.apify.com/cli/docs/reference#apify-create-actorname) command. ```bash apify create my-first-actor --template python-start ``` -This will create a new folder called `my-first-actor`, -download and extract the "Getting started with Python" Actor template there, -create a virtual environment in `my-first-actor/.venv`, -and install the Actor dependencies in it. 
+This will create a new folder called `my-first-actor`, download and extract the "Getting started with Python" Actor template there, create a virtual environment in `my-first-actor/.venv`, and install the Actor dependencies in it. ## Running the Actor -To run the Actor, you can use the [`apify run` command](https://docs.apify.com/cli/docs/reference#apify-run): +To run the Actor, you can use the [`apify run`](https://docs.apify.com/cli/docs/reference#apify-run) command: ```bash cd my-first-actor apify run ``` -This will activate the virtual environment in `.venv` (if no other virtual environment is activated yet), -then start the Actor, passing the right environment variables for local running, -and configure it to use local storages from the `storage` folder. +This will activate the virtual environment in `.venv` (if no other virtual environment is activated yet), then start the Actor, passing the right environment variables for local running, and configure it to use local storages from the `storage` folder. The Actor input, for example, will be in `storage/key_value_stores/default/INPUT.json`. diff --git a/docs/01_overview/03_actor_structure.mdx b/docs/01_overview/03_actor_structure.mdx new file mode 100644 index 00000000..71ece118 --- /dev/null +++ b/docs/01_overview/03_actor_structure.mdx @@ -0,0 +1,35 @@ +--- +title: Actor structure +sidebar_label: Actor structure +--- + +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import UnderscoreMainExample from '!!raw-loader!./code/actor_structure/main.py'; +import MainExample from '!!raw-loader!./code/actor_structure/__main__.py'; + +All Python Actor templates follow the same structure. + +The `.actor/` directory contains the [Actor configuration](https://docs.apify.com/platform/actors/development/actor-config), such as the Actor's definition and input schema, and the Dockerfile necessary to run the Actor on the Apify platform. + +The Actor's runtime dependencies are specified in the `requirements.txt` file, +which follows the [standard requirements file format](https://pip.pypa.io/en/stable/reference/requirements-file-format/). + +The Actor's source code is in the `src/` folder. This folder contains two important files: `main.py`, which contains the main function of the Actor, and `__main__.py`, which is the entrypoint of the Actor package, setting up the Actor [logger](../concepts/logging) and executing the Actor's main function via [`asyncio.run()`](https://docs.python.org/3/library/asyncio-runner.html#asyncio.run). + + + + + {MainExample} + + + + + {UnderscoreMainExample} + + + + +If you want to modify the Actor structure, you need to make sure that your Actor is executable as a module, via `python -m src`, as that is the command started by `apify run` in the Apify CLI. We recommend keeping the entrypoint for the Actor in the `src/__main__.py` file. 
diff --git a/docs/01_overview/code/01_introduction.py b/docs/01_overview/code/01_introduction.py new file mode 100644 index 00000000..c5441d61 --- /dev/null +++ b/docs/01_overview/code/01_introduction.py @@ -0,0 +1,14 @@ +import httpx +from bs4 import BeautifulSoup + +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() + async with httpx.AsyncClient() as client: + response = await client.get(actor_input['url']) + soup = BeautifulSoup(response.content, 'html.parser') + data = {'url': actor_input['url'], 'title': soup.title.string if soup.title else None} + await Actor.push_data(data) diff --git a/docs/01_overview/code/actor_structure/__init__.py b/docs/01_overview/code/actor_structure/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/01_overview/code/actor_structure/__main__.py b/docs/01_overview/code/actor_structure/__main__.py new file mode 100644 index 00000000..8c4ab0b8 --- /dev/null +++ b/docs/01_overview/code/actor_structure/__main__.py @@ -0,0 +1,6 @@ +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/01_overview/code/actor_structure/main.py b/docs/01_overview/code/actor_structure/main.py new file mode 100644 index 00000000..1fb4b610 --- /dev/null +++ b/docs/01_overview/code/actor_structure/main.py @@ -0,0 +1,8 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() + Actor.log.info('Actor input: %s', actor_input) + await Actor.set_value('OUTPUT', 'Hello, world!') diff --git a/docs/01_overview/code/actor_structure/py.typed b/docs/01_overview/code/actor_structure/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/docs/02-guides/01-requests-and-httpx.mdx b/docs/02-guides/01-requests-and-httpx.mdx deleted file mode 100644 index ea8a93af..00000000 --- a/docs/02-guides/01-requests-and-httpx.mdx +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: Using Requests and HTTPX -sidebar_label: Using Requests and HTTPX ---- - -To use either of the libraries mentioned below in your Actors, -you can start from the [Start with Python](https://apify.com/templates?category=python) Actor template. - -## Requests - -The [`requests`](https://requests.readthedocs.io) library is one of the most popular Python libraries for making HTTP requests. - -To use it in your Actors, no special configuration is needed. -Just put `requests` in your `requirements.txt` file, -[reinstall dependencies](../overview/running-locally#adding-dependencies) if you're running the Actor locally, -and you're good to go. 
- -```python title="src/main.py" -import requests -from apify import Actor - -async def main(): - async with Actor: - response = requests.get('http://example.com') - print(response.text) -``` - -### Using proxies with requests - -To use Apify Proxy with `requests`, -you can just generate a proxy URL through [`Actor.create_proxy_configuration()`](../../reference/class/Actor#create_proxy_configuration), -and pass it to `requests` using the [`proxies` argument](https://requests.readthedocs.io/en/latest/user/advanced/#proxies): - -```python title="src/main.py" -import requests -from apify import Actor - -async def main(): - async with Actor: - proxy_configuration = await Actor.create_proxy_configuration() - proxy_url = await proxy_configuration.new_url() - proxies = { - 'http': proxy_url, - 'https': proxy_url, - } - - response = requests.get('http://example.com', proxies=proxies) - print(response.text) -``` - -To learn more about using proxies in your Actor with `requests`, check the [documentation for proxy management](../concepts/proxy-management). - -## HTTPX - -Another very popular Python library for performing HTTP requests is [`HTTPX`](https://www.python-httpx.org/). -Its main advantage over `requests` is the ability to [perform asynchronous HTTP requests](https://www.python-httpx.org/async/), -making it ideal for large-scale, parallel web scraping. - -To use it in your Actors, no special configuration is needed. -Just put `httpx` in your `requirements.txt` file, -[reinstall dependencies](../overview/running-locally#adding-dependencies) if you're running the Actor locally, -and you're good to go. - -```python title="src/main.py" -import asyncio -import httpx -from apify import Actor - -async def main(): - async with Actor: - async with httpx.AsyncClient() as httpx_client: - # This will perform all the requests in parallel - http_requests = [] - for i in range(10): - http_requests.append(httpx_client.get(f'http://example.com/{i}')) - - responses = await asyncio.gather(*http_requests) - print(responses) -``` - -### Using proxies with HTTPX - -To use Apify Proxy with `httpx`, -you can just generate a proxy URL through [`Actor.create_proxy_configuration()`](../../reference/class/Actor#create_proxy_configuration), -and pass it to `httpx` using the [`proxies` argument](https://requests.readthedocs.io/en/latest/user/advanced/#proxies): - -```python title="src/main.py" -import httpx -from apify import Actor - -async def main(): - async with Actor: - proxy_configuration = await Actor.create_proxy_configuration() - proxy_url = await proxy_configuration.new_url() - async with httpx.AsyncClient(proxy=proxy_url) as httpx_client: - response = httpx_client.get(f'http://example.com'), - print(response) -``` - -To learn more about using proxies in your Actor with `httpx`, check the [documentation for proxy management](../concepts/proxy-management). diff --git a/docs/02-guides/02-beautiful-soup.mdx b/docs/02-guides/02-beautiful-soup.mdx deleted file mode 100644 index a7ebdc84..00000000 --- a/docs/02-guides/02-beautiful-soup.mdx +++ /dev/null @@ -1,81 +0,0 @@ ---- -title: Using Beautiful Soup -sidebar_label: Using Beautiful Soup ---- - -[Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/) is a Python library for pulling data out of HTML and XML files. -It provides simple methods and Pythonic idioms for navigating, searching, and modifying a website's element tree, -allowing you to quickly extract the data you need. 
- -## Using BeautifulSoup in Actors - -To create Actors which use BeautifulSoup, start from the [BeautifulSoup & Python](https://apify.com/templates?category=python) Actor template. - -This Actor template already contains the BeautifulSoup library preinstalled, which means you can start using it right away. - -## Example Actor - -This is a simple Actor that recursively scrapes titles from all linked websites, -up to a maximum depth, starting from URLs in the Actor input. - -It uses `requests` to fetch the pages, -and BeautifulSoup to parse their content and read the page title and links to other pages. - -```python title="src/main.py" -from urllib.parse import urljoin - -import requests -from apify import Actor -from bs4 import BeautifulSoup - -async def main(): - async with Actor: - # Read the Actor input - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }]) - max_depth = actor_input.get('max_depth', 1) - - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Enqueue the starting URLs in the default request queue - default_queue = await Actor.open_request_queue() - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url}...') - await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }}) - - # Process the requests in the queue one by one - while request := await default_queue.fetch_next_request(): - url = request['url'] - depth = request['userData']['depth'] - Actor.log.info(f'Scraping {url}...') - - try: - # Fetch the URL using `requests` and parse it using `BeautifulSoup` - response = requests.get(url) - soup = BeautifulSoup(response.content, 'html.parser') - - # If we haven't reached the max depth, - # look for nested links and enqueue their targets - if depth < max_depth: - for link in soup.find_all('a'): - link_href = link.get('href') - link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url}...') - await default_queue.add_request({ - 'url': link_url, - 'userData': {'depth': depth + 1 }, - }) - - # Push the title of the page into the default dataset - title = soup.title.string if soup.title else None - await Actor.push_data({ 'url': url, 'title': title }) - except: - Actor.log.exception(f'Cannot extract data from {url}.') - finally: - # Mark the request as handled so it's not processed again - await default_queue.mark_request_as_handled(request) -``` diff --git a/docs/02-guides/03-playwright.mdx b/docs/02-guides/03-playwright.mdx deleted file mode 100644 index a46f578f..00000000 --- a/docs/02-guides/03-playwright.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -title: Using Playwright -sidebar_label: Using Playwright ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import CodeBlock from '@theme/CodeBlock'; - -[Playwright](https://playwright.dev) is a tool for web automation and testing that can also be used for web scraping. -It allows you to control a web browser programmatically and interact with web pages just as a human would. - -Some of the key features of Playwright for web scraping include: - -- **Cross-browser support** - Playwright supports the latest versions of major browsers like Chrome, Firefox, and Safari, -so you can choose the one that suits your needs the best. 
-- **Headless mode** - Playwright can run in headless mode, -meaning that the browser window is not visible on your screen while it is scraping, -which can be useful for running scraping tasks in the background or in containers without a display. -- **Powerful selectors** - Playwright provides a variety of powerful selectors that allow you to target specific elements on a web page, -including CSS selectors, XPath, and text matching. -- **Emulation of user interactions** - Playwright allows you to emulate user interactions like clicking, scrolling, filling out forms, -and even typing in text, which can be useful for scraping websites that have dynamic content or require user input. - -## Using Playwright in Actors - -To create Actors which use Playwright, start from the [Playwright & Python](https://apify.com/templates?category=python) Actor template. - -On the Apify platform, the Actor will already have Playwright and the necessary browsers preinstalled in its Docker image, -including the tools and setup necessary to run browsers in headful mode. - -When running the Actor locally, you'll need to finish the Playwright setup yourself before you can run the Actor. - - - - { -`source .venv/bin/activate -playwright install --with-deps` - } - - - { -`.venv\\Scripts\\activate -playwright install --with-deps` - } - - - -## Example Actor - -This is a simple Actor that recursively scrapes titles from all linked websites, -up to a maximum depth, starting from URLs in the Actor input. - -It uses Playwright to open the pages in an automated Chrome browser, -and to extract the title and anchor elements after the pages load. - -```python title="src/main.py" -from urllib.parse import urljoin - -from apify import Actor -from playwright.async_api import async_playwright - - -async def main(): - async with Actor: - # Read the Actor input - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }]) - max_depth = actor_input.get('max_depth', 1) - - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Enqueue the starting URLs in the default request queue - default_queue = await Actor.open_request_queue() - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }}) - - # Launch Playwright an open a new browser context - Actor.log.info('Launching Playwright...') - async with async_playwright() as playwright: - browser = await playwright.chromium.launch(headless=Actor.config.headless) - context = await browser.new_context() - - # Process the requests in the queue one by one - while request := await default_queue.fetch_next_request(): - url = request['url'] - depth = request['userData']['depth'] - Actor.log.info(f'Scraping {url} ...') - - try: - # Open the URL in a new Playwright page - page = await context.new_page() - await page.goto(url) - - # If we haven't reached the max depth, - # look for nested links and enqueue their targets - if depth < max_depth: - for link in await page.locator('a').all(): - link_href = await link.get_attribute('href') - link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - await default_queue.add_request({ - 'url': link_url, - 'userData': {'depth': depth + 1 }, - }) - - # Push the title of the page into the default dataset - title = await page.title() - await 
Actor.push_data({ 'url': url, 'title': title }) - except: - Actor.log.exception(f'Cannot extract data from {url}.') - finally: - await page.close() - await default_queue.mark_request_as_handled(request) -``` diff --git a/docs/02-guides/04-selenium.mdx b/docs/02-guides/04-selenium.mdx deleted file mode 100644 index 3fb77d7c..00000000 --- a/docs/02-guides/04-selenium.mdx +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: Using Selenium -sidebar_label: Using Selenium ---- - -[Selenium](https://www.selenium.dev/) is a tool for web automation and testing that can also be used for web scraping. -It allows you to control a web browser programmatically and interact with web pages just as a human would. - -Some of the key features of Selenium for web scraping include: - -- **Cross-browser support** - Selenium supports the latest versions of major browsers like Chrome, Firefox, and Safari, -so you can choose the one that suits your needs the best. -- **Headless mode** - Selenium can run in headless mode, -meaning that the browser window is not visible on your screen while it is scraping, -which can be useful for running scraping tasks in the background or in containers without a display. -- **Powerful selectors** - Selenium provides a variety of powerful selectors that allow you to target specific elements on a web page, -including CSS selectors, XPath, and text matching. -- **Emulation of user interactions** - Selenium allows you to emulate user interactions like clicking, scrolling, filling out forms, -and even typing in text, which can be useful for scraping websites that have dynamic content or require user input. - -## Using Selenium in Actors - -To create Actors which use Selenium, start from the [Selenium & Python](https://apify.com/templates?category=python) Actor template. - -On the Apify platform, the Actor will already have Selenium and the necessary browsers preinstalled in its Docker image, -including the tools and setup necessary to run browsers in headful mode. - -When running the Actor locally, you'll need to install the Selenium browser drivers yourself. -Refer to the [Selenium documentation](https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/) for installation instructions. - -## Example Actor - -This is a simple Actor that recursively scrapes titles from all linked websites, -up to a maximum depth, starting from URLs in the Actor input. - -It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, -and to extract the title and anchor elements after the pages load. 
- -```python title="src/main.py" -from urllib.parse import urljoin - -from apify import Actor -from selenium import webdriver -from selenium.webdriver.chrome.options import Options as ChromeOptions -from selenium.webdriver.common.by import By - - -async def main(): - async with Actor: - # Read the Actor input - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }]) - max_depth = actor_input.get('max_depth', 1) - - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Enqueue the starting URLs in the default request queue - default_queue = await Actor.open_request_queue() - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }}) - - # Launch a new Selenium Chrome WebDriver - Actor.log.info('Launching Chrome WebDriver...') - chrome_options = ChromeOptions() - if Actor.config.headless: - chrome_options.add_argument('--headless') - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--disable-dev-shm-usage') - driver = webdriver.Chrome(options=chrome_options) - - driver.get('http://www.example.com') - assert driver.title == 'Example Domain' - - # Process the requests in the queue one by one - while request := await default_queue.fetch_next_request(): - url = request['url'] - depth = request['userData']['depth'] - Actor.log.info(f'Scraping {url} ...') - - try: - # Open the URL in the Selenium WebDriver - driver.get(url) - - # If we haven't reached the max depth, - # look for nested links and enqueue their targets - if depth < max_depth: - for link in driver.find_elements(By.TAG_NAME, 'a'): - link_href = link.get_attribute('href') - link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - await default_queue.add_request({ - 'url': link_url, - 'userData': {'depth': depth + 1 }, - }) - - # Push the title of the page into the default dataset - title = driver.title - await Actor.push_data({ 'url': url, 'title': title }) - except: - Actor.log.exception(f'Cannot extract data from {url}.') - finally: - await default_queue.mark_request_as_handled(request) - - driver.quit() -``` diff --git a/docs/02-guides/05-scrapy.mdx b/docs/02-guides/05-scrapy.mdx deleted file mode 100644 index ea9825d0..00000000 --- a/docs/02-guides/05-scrapy.mdx +++ /dev/null @@ -1,111 +0,0 @@ ---- -title: Using Scrapy -sidebar_label: Using Scrapy ---- - -:::tip - -Our CLI now has native support for running Scrapy spiders on Apify! Check out the [Scrapy migration guide](https://docs.apify.com/cli/docs/integrating-scrapy) for more information. - -::: - -Scrapy is an open-source web scraping framework written in Python. -It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites, handle pagination and navigation. - -Some of the key features of Scrapy for web scraping include: - -- **Request and response handling** - Scrapy provides an easy-to-use interface for making HTTP requests and handling responses, -allowing you to navigate through web pages and extract data. -- **Robust Spider framework** - Scrapy has a spider framework that allows you to define how to scrape data from websites, -including how to follow links, how to handle pagination, and how to parse the data. 
-- **Built-in data extraction** - Scrapy includes built-in support for data extraction using XPath and CSS selectors, -allowing you to easily extract data from HTML and XML documents. -- **Integration with other tool** - Scrapy can be integrated with other Python tools like BeautifulSoup and Selenium for more advanced scraping tasks. - -## Using Scrapy in Actors - -To create Actors which use Scrapy, start from the [Scrapy & Python](https://apify.com/templates?category=python) Actor template. - -This template already contains the structure and setup necessary to integrate Scrapy into your Actors, -setting up the Scrapy settings, asyncio reactor, Actor logger and item pipeline -as necessary to make Scrapy spiders run in Actors and save their outputs in Apify datasets. - -### Manual setup - -If you don't want to use the template, there are several things you need to set up. - -#### Event loop & reactor - -Since the `Actor` class uses `asyncio` under the hood, -Scrapy has to use the [`AsyncioSelectorReactor`](https://docs.scrapy.org/en/latest/topics/asyncio.html) reactor. -And to be able to run the Scrapy engine in an already running loop, -you have to use the [`nest_asyncio`](https://pypi.org/project/nest-asyncio/) package. - -#### Item pipeline - -To push the results into the Actor's default dataset, -the engine has to use a custom [`ItemPipeline`](https://docs.scrapy.org/en/latest/topics/item-pipeline.html) -that calls `Actor.push_data()` on the scraped items. - -## Example Actor - -This is a simple Actor that recursively scrapes titles from all linked websites, -up to a maximum depth, starting from URLs in the Actor input. - -It uses Scrapy download the pages, extract the results from each page, and continue recursively through the website pagination. 
- -```python title="src/main.py" -from urllib.parse import urljoin -import nest_asyncio -import scrapy -from itemadapter import ItemAdapter -from scrapy.crawler import CrawlerProcess -from scrapy.utils.project import get_project_settings -from scrapy.utils.reactor import install_reactor - -from apify import Actor - -# This is necessary so that twisted and asyncio work well together -install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') -nest_asyncio.apply() - -# Scrapes titles pages and enqueues all links it finds on the page -class TitleSpider(scrapy.Spider): - name = 'title_spider' - - def __init__(self, start_urls, *args, **kwargs): - super().__init__(*args, **kwargs) - self.start_urls = start_urls - - def parse(self, response): - yield { - 'url': response.url, - 'title': response.css('title::text').extract_first(), - } - for link_href in response.css('a::attr("href")'): - link_url = urljoin(response.url, link_href.get()) - if link_url.startswith(('http://', 'https://')): - yield scrapy.Request(link_url) - -# Pushes the scraped items into the Actor's default dataset -class ActorDatasetPushPipeline: - async def process_item(self, item, spider): - item_dict = ItemAdapter(item).asdict() - await Actor.push_data(item_dict) - return item - -async def main(): - async with Actor: - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }]) - start_urls = [start_url.get('url') for start_url in start_urls] - - settings = get_project_settings() - settings['ITEM_PIPELINES'] = { ActorDatasetPushPipeline: 1 } - settings['TWISTED_REACTOR'] = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' - settings['DEPTH_LIMIT'] = actor_input.get('max_depth', 1) - - process = CrawlerProcess(settings) - process.crawl(TitleSpider, start_urls=start_urls) - process.start() -``` diff --git a/docs/02_guides/01_beautifulsoup_httpx.mdx b/docs/02_guides/01_beautifulsoup_httpx.mdx new file mode 100644 index 00000000..222a4ebe --- /dev/null +++ b/docs/02_guides/01_beautifulsoup_httpx.mdx @@ -0,0 +1,30 @@ +--- +title: Using BeautifulSoup with HTTPX +sidebar_label: Using BeautifulSoup with HTTPX +--- + +import CodeBlock from '@theme/CodeBlock'; + +import BeautifulSoupHttpxExample from '!!raw-loader!./code/01_beautifulsoup_httpx.py'; + +In this guide, you'll learn how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library with the [HTTPX](https://www.python-httpx.org/) library in your Apify Actors. + +## Introduction + +`BeautifulSoup` is a Python library for extracting data from HTML and XML files. It provides simple methods and Pythonic idioms for navigating, searching, and modifying a website's element tree, enabling efficient data extraction. + +`HTTPX` is a modern, high-level HTTP client library for Python. It provides a simple interface for making HTTP requests and supports both synchronous and asynchronous requests. + +To create an `Actor` which uses those libraries, start from the [BeautifulSoup & Python](https://apify.com/templates/categories/python) Actor template. This template includes the `BeautifulSoup` and `HTTPX` libraries preinstalled, allowing you to begin development immediately. + +## Example Actor + +Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses `HTTPX` for fetching pages and `BeautifulSoup` for parsing their content to extract titles and links to other pages. 
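Before the full example, here is a condensed, illustrative sketch of the same approach. It is not the exact code shipped with the template: it assumes the Actor input contains `start_urls` and `max_depth` fields (the same shape the full example uses) and, for brevity, keeps the pending URLs in a plain in-memory list instead of the request queue.

```python
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup

from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input (assumed to contain `start_urls` and `max_depth`).
        actor_input = await Actor.get_input() or {}
        start_urls = [u['url'] for u in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
        max_depth = actor_input.get('max_depth', 1)

        # Simple in-memory frontier of (url, depth) pairs instead of the request queue.
        queue = [(url, 0) for url in start_urls]
        visited = set()

        async with httpx.AsyncClient() as client:
            while queue:
                url, depth = queue.pop(0)
                if url in visited:
                    continue
                visited.add(url)
                Actor.log.info(f'Scraping {url} ...')

                try:
                    # Fetch the page with HTTPX and parse it with BeautifulSoup.
                    response = await client.get(url, follow_redirects=True)
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Enqueue links found on the page, unless the depth limit was reached.
                    if depth < max_depth:
                        for link in soup.find_all('a'):
                            link_url = urljoin(url, link.get('href') or '')
                            if link_url.startswith(('http://', 'https://')):
                                queue.append((link_url, depth + 1))

                    # Push the page title into the default dataset.
                    title = soup.title.string if soup.title else None
                    await Actor.push_data({'url': url, 'title': title})
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
```

The complete, request-queue-based version of this Actor is shown in the full example that follows.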
+ + + {BeautifulSoupHttpxExample} + + +## Conclusion + +In this guide, you learned how to use the `BeautifulSoup` with the `HTTPX` in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/02_crawlee.mdx b/docs/02_guides/02_crawlee.mdx new file mode 100644 index 00000000..85847225 --- /dev/null +++ b/docs/02_guides/02_crawlee.mdx @@ -0,0 +1,37 @@ +--- +title: Using Crawlee +sidebar_label: Using Crawlee +--- + +import CodeBlock from '@theme/CodeBlock'; + +import CrawleeBeautifulSoupExample from '!!raw-loader!./code/02_crawlee_beautifulsoup.py'; +import CrawleePlaywrightExample from '!!raw-loader!./code/02_crawlee_playwright.py'; + +In this guide you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. + +## Introduction + +`Crawlee` is a Python library for web scraping and browser automation that provides a robust and flexible framework for building web scraping tasks. It seamlessly integrates with the Apify platform and supports a variety of scraping techniques, from static HTML parsing to dynamic JavaScript-rendered content handling. Crawlee offers a range of crawlers, including HTTP-based crawlers like [`HttpCrawler`](https://crawlee.dev/python/api/class/HttpCrawler), [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) and [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and browser-based crawlers like [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler), to suit different scraping needs. + +In this guide, you'll learn how to use Crawlee with `BeautifulSoupCrawler` and `PlaywrightCrawler` to build Apify Actors for web scraping. + +## Actor with BeautifulSoupCrawler + +The `BeautifulSoupCrawler` is ideal for extracting data from static HTML pages. It uses `BeautifulSoup` for parsing and [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication, ensuring efficient and lightweight scraping. If you do not need to execute JavaScript on the page, `BeautifulSoupCrawler` is a great choice for your scraping tasks. Below is an example of how to use `BeautifulSoupCrawler` in an Apify Actor. + + + {CrawleeBeautifulSoupExample} + + +## Actor with PlaywrightCrawler + +The `PlaywrightCrawler` is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use `PlaywrightCrawler` in an Apify Actor. + + + {CrawleePlaywrightExample} + + +## Conclusion + +In this guide, you learned how to use the `Crawlee` library in your Apify Actors. By using the `BeautifulSoupCrawler` and `PlaywrightCrawler` crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. 
If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/03_playwright.mdx b/docs/02_guides/03_playwright.mdx new file mode 100644 index 00000000..b8867983 --- /dev/null +++ b/docs/02_guides/03_playwright.mdx @@ -0,0 +1,56 @@ +--- +title: Using Playwright +sidebar_label: Using Playwright +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; + +import PlaywrightExample from '!!raw-loader!./code/03_playwright.py'; + +[Playwright](https://playwright.dev) is a tool for web automation and testing that can also be used for web scraping. It allows you to control a web browser programmatically and interact with web pages just as a human would. + +Some of the key features of Playwright for web scraping include: + +- **Cross-browser support** - Playwright supports the latest versions of major browsers like Chrome, Firefox, and Safari, so you can choose the one that suits your needs the best. +- **Headless mode** - Playwright can run in headless mode, meaning that the browser window is not visible on your screen while it is scraping, which can be useful for running scraping tasks in the background or in containers without a display. +- **Powerful selectors** - Playwright provides a variety of powerful selectors that allow you to target specific elements on a web page, including CSS selectors, XPath, and text matching. +- **Emulation of user interactions** - Playwright allows you to emulate user interactions like clicking, scrolling, filling out forms, and even typing in text, which can be useful for scraping websites that have dynamic content or require user input. + +## Using Playwright in Actors + +To create Actors which use Playwright, start from the [Playwright & Python](https://apify.com/templates/categories/python) Actor template. + +On the Apify platform, the Actor will already have Playwright and the necessary browsers preinstalled in its Docker image, including the tools and setup necessary to run browsers in headful mode. + +When running the Actor locally, you'll need to finish the Playwright setup yourself before you can run the Actor. + + + + { +`source .venv/bin/activate +playwright install --with-deps` + } + + + { +`.venv\\Scripts\\activate +playwright install --with-deps` + } + + + +## Example Actor + +This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. + +It uses Playwright to open the pages in an automated Chrome browser, and to extract the title and anchor elements after the pages load. + + + {PlaywrightExample} + + +## Conclusion + +In this guide you learned how to create Actors that use Playwright to scrape websites. Playwright is a powerful tool that can be used to manage browser instances and scrape websites that require JavaScript execution. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/02_guides/04_selenium.mdx b/docs/02_guides/04_selenium.mdx new file mode 100644 index 00000000..9a11a69c --- /dev/null +++ b/docs/02_guides/04_selenium.mdx @@ -0,0 +1,46 @@ +--- +title: Using Selenium +sidebar_label: Using Selenium +--- + +import CodeBlock from '@theme/CodeBlock'; + +import SeleniumExample from '!!raw-loader!./code/04_selenium.py'; + +[Selenium](https://www.selenium.dev/) is a tool for web automation and testing that can also be used for web scraping. It allows you to control a web browser programmatically and interact with web pages just as a human would. + +Some of the key features of Selenium for web scraping include: + +- **Cross-browser support** - Selenium supports the latest versions of major browsers like Chrome, Firefox, and Safari, +so you can choose the one that suits your needs the best. +- **Headless mode** - Selenium can run in headless mode, +meaning that the browser window is not visible on your screen while it is scraping, +which can be useful for running scraping tasks in the background or in containers without a display. +- **Powerful selectors** - Selenium provides a variety of powerful selectors that allow you to target specific elements on a web page, +including CSS selectors, XPath, and text matching. +- **Emulation of user interactions** - Selenium allows you to emulate user interactions like clicking, scrolling, filling out forms, +and even typing in text, which can be useful for scraping websites that have dynamic content or require user input. + +## Using Selenium in Actors + +To create Actors which use Selenium, start from the [Selenium & Python](https://apify.com/templates/categories/python) Actor template. + +On the Apify platform, the Actor will already have Selenium and the necessary browsers preinstalled in its Docker image, +including the tools and setup necessary to run browsers in headful mode. + +When running the Actor locally, you'll need to install the Selenium browser drivers yourself. +Refer to the [Selenium documentation](https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/) for installation instructions. + +## Example Actor + +This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. + +It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title and anchor elements after the pages load. + + + {SeleniumExample} + + +## Conclusion + +In this guide you learned how to use Selenium for web scraping in Apify Actors. You can now create your own Actors that use Selenium to scrape dynamic websites and interact with web pages just like a human would. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx new file mode 100644 index 00000000..b113ee3e --- /dev/null +++ b/docs/02_guides/05_scrapy.mdx @@ -0,0 +1,96 @@ +--- +title: Using Scrapy +sidebar_label: Using Scrapy +--- + +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import UnderscoreMainExample from '!!raw-loader!./code/scrapy_src/__main__.py'; +import MainExample from '!!raw-loader!./code/scrapy_src/main.py'; +import ItemsExample from '!!raw-loader!./code/scrapy_src/items.py'; +import SettingsExample from '!!raw-loader!./code/scrapy_src/settings.py'; +import TitleSpiderExample from '!!raw-loader!./code/scrapy_src/spiders/title.py'; + +[Scrapy](https://scrapy.org/) is an open-source web scraping framework written in Python. It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites and how to handle pagination and navigation. + +:::tip + +Our CLI now supports transforming Scrapy projects into Apify Actors with a single command! Check out the [Scrapy migration guide](https://docs.apify.com/cli/docs/integrating-scrapy) for more information. + +::: + +Some of the key features of Scrapy for web scraping include: + +- **Request and response handling** - Scrapy provides an easy-to-use interface for making HTTP requests and handling responses, +allowing you to navigate through web pages and extract data. +- **Robust Spider framework** - Scrapy has a spider framework that allows you to define how to scrape data from websites, +including how to follow links, how to handle pagination, and how to parse the data. +- **Built-in data extraction** - Scrapy includes built-in support for data extraction using XPath and CSS selectors, +allowing you to easily extract data from HTML and XML documents. +- **Integration with other tools** - Scrapy can be integrated with other Python tools like BeautifulSoup and Selenium for more advanced scraping tasks. + +## Using Scrapy template + +The fastest way to start using Scrapy in Apify Actors is by leveraging the [Scrapy Actor template](https://apify.com/templates/categories/python). This template provides a pre-configured structure and setup necessary to integrate Scrapy into your Actors seamlessly. It takes care of setting up the Scrapy settings, the `asyncio` reactor, the Actor logger, and the item pipeline needed to make Scrapy spiders run in Actors and save their outputs to Apify datasets. + +## Manual setup + +If you prefer not to use the template, you will need to manually configure several components to integrate Scrapy with the Apify SDK. + +### Event loop & reactor + +The Apify SDK is built on Python's asynchronous [`asyncio`](https://docs.python.org/3/library/asyncio.html) library, whereas Scrapy uses [`twisted`](https://twisted.org/) for its asynchronous operations. To make these two frameworks work together, you need to: + +- Set the [`AsyncioSelectorReactor`](https://docs.scrapy.org/en/latest/topics/asyncio.html#installing-the-asyncio-reactor) in Scrapy's project settings: This reactor is `twisted`'s implementation of the `asyncio` event loop, enabling compatibility between the two libraries. +- Install [`nest_asyncio`](https://pypi.org/project/nest-asyncio/): The `nest_asyncio` package allows the asyncio event loop to run within an already running loop, which is essential for integration with the Apify SDK (a minimal sketch of both adjustments follows this list).
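For illustration, here is a minimal sketch of the two adjustments described above; the `__main__.py` file shown later in this guide applies the same calls in their full context:

```python
import nest_asyncio
from scrapy.utils.reactor import install_reactor

# Install Twisted's asyncio-based reactor so Scrapy can share the asyncio event loop.
# Alternatively, set TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
# in your Scrapy project settings.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

# Allow the asyncio event loop to be re-entered while it is already running.
# Note: the reactor must be installed before calling nest_asyncio.apply().
nest_asyncio.apply()
```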
+ +By making these adjustments, you can ensure collaboration between `twisted`-based Scrapy and the `asyncio`-based Apify SDK. + +### Other components + +We also prepared other Scrapy components to work with Apify SDK, they are available in the [`apify/scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) sub-package. These components include: + +- `ApifyScheduler`: A Scrapy scheduler that uses the Apify Request Queue to manage requests. +- `ApifyHttpProxyMiddleware`: A Scrapy middleware for working with Apify proxies. +- `ActorDatasetPushPipeline`: A Scrapy item pipeline that pushes scraped items into the Apify dataset. + +The module contains other helper functions, like `apply_apify_settings` for applying these components to Scrapy settings, and `to_apify_request` and `to_scrapy_request` for converting between Apify and Scrapy request objects. + +## Example Actor + +Here is an example of a Scrapy Actor that scrapes the titles of web pages and enqueues all links found on each page. This example is identical to the one provided in the Apify Actor templates. + + + + + {UnderscoreMainExample} + + + + + {MainExample} + + + + + {ItemsExample} + + + + + {SettingsExample} + + + + + {TitleSpiderExample} + + + + +## Conclusion + +In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects +using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/02_guides/code/01_beautifulsoup_httpx.py b/docs/02_guides/code/01_beautifulsoup_httpx.py new file mode 100644 index 00000000..36d3bca7 --- /dev/null +++ b/docs/02_guides/code/01_beautifulsoup_httpx.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from urllib.parse import urljoin + +from bs4 import BeautifulSoup +from httpx import AsyncClient + +from apify import Actor, Request + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Create an HTTPX client to fetch the HTML content of the URLs. + async with AsyncClient() as client: + # Process the URLs from the request queue. 
+ while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fetch the HTTP response from the specified URL using HTTPX. + response = await client.get(url, follow_redirects=True) + + # Parse the HTML content using Beautiful Soup. + soup = BeautifulSoup(response.content, 'html.parser') + + # If the current depth is less than max_depth, find nested links + # and enqueue them. + if depth < max_depth: + for link in soup.find_all('a'): + link_href = link.get('href') + link_url = urljoin(url, link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data. + data = { + 'url': url, + 'title': soup.title.string if soup.title else None, + 'h1s': [h1.text for h1 in soup.find_all('h1')], + 'h2s': [h2.text for h2 in soup.find_all('h2')], + 'h3s': [h3.text for h3 in soup.find_all('h3')], + } + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) diff --git a/docs/02_guides/code/02_crawlee_beautifulsoup.py b/docs/02_guides/code/02_crawlee_beautifulsoup.py new file mode 100644 index 00000000..489d83ae --- /dev/null +++ b/docs/02_guides/code/02_crawlee_beautifulsoup.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + +from apify import Actor + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = [ + url.get('url') + for url in actor_input.get( + 'start_urls', + [{'url': 'https://apify.com'}], + ) + ] + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a crawler. + crawler = BeautifulSoupCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, + ) + + # Define a request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + url = context.request.url + Actor.log.info(f'Scraping {url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + 'h1s': [h1.text for h1 in context.soup.find_all('h1')], + 'h2s': [h2.text for h2 in context.soup.find_all('h2')], + 'h3s': [h3.text for h3 in context.soup.find_all('h3')], + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links() + + # Run the crawler with the starting requests.
+ await crawler.run(start_urls) diff --git a/docs/02_guides/code/02_crawlee_playwright.py b/docs/02_guides/code/02_crawlee_playwright.py new file mode 100644 index 00000000..674c1e94 --- /dev/null +++ b/docs/02_guides/code/02_crawlee_playwright.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +from apify import Actor + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = [ + url.get('url') + for url in actor_input.get( + 'start_urls', + [{'url': 'https://apify.com'}], + ) + ] + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a crawler. + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=50, + headless=True, + browser_launch_options={ + 'args': ['--disable-gpu'], + }, + ) + + # Define a request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + url = context.request.url + Actor.log.info(f'Scraping {url}...') + + # Extract the desired data. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()], + 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()], + 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()], + } + + # Store the extracted data to the default dataset. + await context.push_data(data) + + # Enqueue additional links found on the current page. + await context.enqueue_links() + + # Run the crawler with the starting requests. + await crawler.run(start_urls) diff --git a/docs/02_guides/code/03_playwright.py b/docs/02_guides/code/03_playwright.py new file mode 100644 index 00000000..78ebdda3 --- /dev/null +++ b/docs/02_guides/code/03_playwright.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from urllib.parse import urljoin + +from playwright.async_api import async_playwright + +from apify import Actor, Request + +# Note: To run this Actor locally, ensure that Playwright browsers are installed. +# Run `playwright install --with-deps` in the Actor's virtual environment to install them. +# When running on the Apify platform, these dependencies are already included +# in the Actor's Docker image. + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in actor input, exiting...') + await Actor.exit() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. 
+ for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + Actor.log.info('Launching Playwright...') + + # Launch Playwright and open a new browser context. + async with async_playwright() as playwright: + # Configure the browser to launch in headless mode as per Actor configuration. + browser = await playwright.chromium.launch( + headless=Actor.config.headless, + args=['--disable-gpu'], + ) + context = await browser.new_context() + + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an enexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Open a new page in the browser context and navigate to the URL. + page = await context.new_page() + await page.goto(url) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. + if depth < max_depth: + for link in await page.locator('a').all(): + link_href = await link.get_attribute('href') + link_url = urljoin(url, link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data. + data = { + 'url': url, + 'title': await page.title(), + } + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await page.close() + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) diff --git a/docs/02_guides/code/04_selenium.py b/docs/02_guides/code/04_selenium.py new file mode 100644 index 00000000..75c55b2f --- /dev/null +++ b/docs/02_guides/code/04_selenium.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import asyncio +from urllib.parse import urljoin + +from selenium import webdriver +from selenium.webdriver.chrome.options import Options as ChromeOptions +from selenium.webdriver.common.by import By + +from apify import Actor, Request + +# To run this Actor locally, you need to have the Selenium Chromedriver installed. +# Follow the installation guide at: +# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/ +# When running on the Apify platform, the Chromedriver is already included +# in the Actor's Docker image. + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in actor input, exiting...') + await Actor.exit() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. 
+ for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Launch a new Selenium Chrome WebDriver and configure it. + Actor.log.info('Launching Chrome WebDriver...') + chrome_options = ChromeOptions() + + if Actor.config.headless: + chrome_options.add_argument('--headless') + + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + driver = webdriver.Chrome(options=chrome_options) + + # Test WebDriver setup by navigating to an example page. + driver.get('http://www.example.com') + if driver.title != 'Example Domain': + raise ValueError('Failed to open example page.') + + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an enexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread + # for non-blocking execution. + await asyncio.to_thread(driver.get, url) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. + if depth < max_depth: + for link in driver.find_elements(By.TAG_NAME, 'a'): + link_href = link.get_attribute('href') + link_url = urljoin(url, link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data. + data = { + 'url': url, + 'title': driver.title, + } + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) + + driver.quit() diff --git a/docs/02_guides/code/scrapy_src/__init__.py b/docs/02_guides/code/scrapy_src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/02_guides/code/scrapy_src/__main__.py b/docs/02_guides/code/scrapy_src/__main__.py new file mode 100644 index 00000000..56d477dd --- /dev/null +++ b/docs/02_guides/code/scrapy_src/__main__.py @@ -0,0 +1,121 @@ +"""Apify Actor integration for Scrapy projects. + +This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's +logging system, and establishing the required environment to run the Scrapy spider within the Apify platform. + +This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally +or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using +`scrapy crawl title_spider`. + +We recommend you do not modify this file unless you really know what you are doing. +""" + +# ruff: noqa: E402 + +# We need to configure the logging first before we import anything else, so that nothing else imports +# `scrapy.utils.log` before we patch it. 
+from __future__ import annotations + +from logging import StreamHandler, getLogger +from typing import Any + +from scrapy.utils import log as scrapy_logging +from scrapy.utils.project import get_project_settings + +from apify.log import ActorLogFormatter + +# Define names of the loggers. +MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy'] +OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted'] +ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES + +# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file, +# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for +# a specific logger, do it in this file. +settings = get_project_settings() +LOGGING_LEVEL = settings['LOG_LEVEL'] + +# Define a logging handler which will be used for the loggers. +apify_handler = StreamHandler() +apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True)) + + +def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None: + """Configure a logger with the specified settings. + + Args: + logger_name: The name of the logger to be configured. + log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...). + handlers: Optional list of logging handlers. + """ + logger = getLogger(logger_name) + logger.setLevel(log_level) + logger.handlers = [] + + for handler in handlers: + logger.addHandler(handler) + + +# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from +# the `main.py` and Scrapy components. +for logger_name in MAIN_LOGGER_NAMES: + configure_logger(logger_name, LOGGING_LEVEL, apify_handler) + +# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging` +# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though +# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method +# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because +# otherwise we would lose some log messages. +old_configure_logging = scrapy_logging.configure_logging + + +def new_configure_logging(*args: Any, **kwargs: Any) -> None: + """Configure logging for Scrapy and root loggers to ensure consistent logging behavior. + + We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root + logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary + loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here + these four loggers and the root logger. + """ + old_configure_logging(*args, **kwargs) + + # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger` + # property within spiders. See details in the Spider logger property: + # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46. + configure_logger(None, LOGGING_LEVEL, apify_handler) + + # We modify other loggers only by setting up their log level. A custom log handler is added + # only to the root logger to avoid duplicate log messages. 
+ for logger_name in ALL_LOGGER_NAMES: + configure_logger(logger_name, LOGGING_LEVEL) + + # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless + # messages, especially when running on the platform. + configure_logger('httpx', 'WARNING') + + +scrapy_logging.configure_logging = new_configure_logging + +# Now we can do the rest of the setup. +import asyncio +import os + +import nest_asyncio +from scrapy.utils.reactor import install_reactor + +from .main import main + +# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is +# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries +# to work together. + +# +# Note: The reactor must be installed before applying `nest_asyncio.apply()`; otherwise, it will not work correctly +# on Windows. +install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor') +nest_asyncio.apply() + +# Specify the path to the Scrapy project settings module. +os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' + +# Run the Apify main coroutine in the event loop. +asyncio.run(main()) diff --git a/docs/02_guides/code/scrapy_src/items.py b/docs/02_guides/code/scrapy_src/items.py new file mode 100644 index 00000000..eae7ff23 --- /dev/null +++ b/docs/02_guides/code/scrapy_src/items.py @@ -0,0 +1,17 @@ +"""Scrapy item models module. + +This module defines Scrapy item models for scraped data. Items represent structured data +extracted by spiders. + +For detailed information on creating and utilizing items, refer to the official documentation: +https://docs.scrapy.org/en/latest/topics/items.html +""" + +from scrapy import Field, Item + + +class TitleItem(Item): + """Represents a title item scraped from a web page.""" + + url = Field() + title = Field() diff --git a/docs/02_guides/code/scrapy_src/main.py b/docs/02_guides/code/scrapy_src/main.py new file mode 100644 index 00000000..1a878c5b --- /dev/null +++ b/docs/02_guides/code/scrapy_src/main.py @@ -0,0 +1,60 @@ +"""This module defines the main entry point for the Apify Actor. + +This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine +processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by +applying Apify-related settings, which includes adding a custom scheduler, retry middleware, and an item pipeline +for pushing data to the Apify dataset. + +Customization: +-------------- + +Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy +components like spiders and handling Actor input. However, make sure you have a clear understanding of your +modifications. For instance, removing `apply_apify_settings` breaks the integration between Scrapy and Apify. + +Documentation: +-------------- + +For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations, +and more, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy. +""" + +from __future__ import annotations + +from scrapy.crawler import CrawlerProcess + +# Import your Scrapy spider here. +from .spiders.title import TitleSpider as Spider +from apify import Actor +from apify.scrapy.utils import apply_apify_settings + +# Default input values for local execution using `apify run`.
+LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}] + + +async def main() -> None: + """Apify Actor main coroutine for executing the Scrapy spider.""" + # Enter the context of the Actor. + async with Actor: + Actor.log.info('Actor is being executed...') + + # Retrieve and process Actor input. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS) + proxy_config = actor_input.get('proxyConfiguration') + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs. + for start_url in start_urls: + url = start_url.get('url') + await request_queue.add_request(url) + + # Apply Apify settings, it will override the Scrapy project settings. + settings = apply_apify_settings(proxy_config=proxy_config) + + # Execute the spider using Scrapy `CrawlerProcess`. + process = CrawlerProcess(settings, install_root_handler=False) + process.crawl(Spider) + process.start() diff --git a/docs/02_guides/code/scrapy_src/py.typed b/docs/02_guides/code/scrapy_src/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/docs/02_guides/code/scrapy_src/settings.py b/docs/02_guides/code/scrapy_src/settings.py new file mode 100644 index 00000000..8a0fd3e6 --- /dev/null +++ b/docs/02_guides/code/scrapy_src/settings.py @@ -0,0 +1,15 @@ +"""Scrapy settings module. + +This module contains Scrapy settings for the project, defining various configurations and options. + +For more comprehensive details on Scrapy settings, refer to the official documentation: +http://doc.scrapy.org/en/latest/topics/settings.html +""" + +BOT_NAME = 'titlebot' +DEPTH_LIMIT = 1 +LOG_LEVEL = 'INFO' +NEWSPIDER_MODULE = 'spiders' +REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7' +ROBOTSTXT_OBEY = True +SPIDER_MODULES = ['spiders'] diff --git a/docs/02_guides/code/scrapy_src/spiders/__init__.py b/docs/02_guides/code/scrapy_src/spiders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/02_guides/code/scrapy_src/spiders/py.typed b/docs/02_guides/code/scrapy_src/spiders/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/docs/02_guides/code/scrapy_src/spiders/title.py b/docs/02_guides/code/scrapy_src/spiders/title.py new file mode 100644 index 00000000..7be37b68 --- /dev/null +++ b/docs/02_guides/code/scrapy_src/spiders/title.py @@ -0,0 +1,53 @@ +# ruff: noqa: TID252, RUF012 + +from __future__ import annotations + +from typing import TYPE_CHECKING +from urllib.parse import urljoin + +from scrapy import Request, Spider + +from ..items import TitleItem + +if TYPE_CHECKING: + from collections.abc import Generator + + from scrapy.responsetypes import Response + + +class TitleSpider(Spider): + """Scrapes title pages and enqueues all links found on the page.""" + + name = 'title_spider' + + # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input + # when the project is executed using Apify. + start_urls = ['https://apify.com/'] + + # Scrape only the pages within the Apify domain. + allowed_domains = ['apify.com'] + + # Limit the number of pages to scrape. + custom_settings = {'CLOSESPIDER_PAGECOUNT': 10} + + def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: + """Parse the web page response. + + Args: + response: The web page response. + + Yields: + Yields scraped TitleItem and Requests for links. 
+ """ + self.logger.info('TitleSpider is parsing %s...', response) + + # Extract and yield the TitleItem + url = response.url + title = response.css('title::text').extract_first() + yield TitleItem(url=url, title=title) + + # Extract all links from the page, create Requests out of them, and yield them + for link_href in response.css('a::attr("href")'): + link_url = urljoin(response.url, link_href.get()) + if link_url.startswith(('http://', 'https://')): + yield Request(link_url) diff --git a/docs/03-concepts/01-actor-lifecycle.mdx b/docs/03-concepts/01-actor-lifecycle.mdx deleted file mode 100644 index 953040da..00000000 --- a/docs/03-concepts/01-actor-lifecycle.mdx +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: Actor lifecycle -sidebar_label: Actor lifecycle ---- - -## Lifecycle methods - -### Initialization and cleanup - -At the start of its runtime, the Actor needs to initialize itself, its event manager and its storages, -and at the end of the runtime it needs to close these cleanly. -The Apify SDK provides several options on how to manage this. - -#### `Actor.init()` and `Actor.exit()` - -The [`Actor.init()`](../../reference/class/Actor#init) method initializes the Actor, -the event manager which processes the Actor events from the platform event websocket, -and the storage client used in the execution environment. -It should be called before performing any other Actor operations. - -The [`Actor.exit()`](../../reference/class/Actor#exit) method then exits the Actor cleanly, -tearing down the event manager and the storage client. -There is also the [`Actor.fail()`](../../reference/class/Actor#fail) method, which exits the Actor while marking it as failed. - -```python title="src/main.py" -from apify import Actor -from apify_shared.consts import ActorExitCodes - -async def main(): - await Actor.init() - try: - Actor.log.info('Actor input:', await Actor.get_input()) - await Actor.set_value('OUTPUT', 'Hello, world!') - raise RuntimeError('Ouch!') - await Actor.exit() - except Exception as e: - Actor.log.exception('Error while running Actor') - await Actor.fail(exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW, exception=e) -``` - -#### Context manager - -So that you don't have to call the lifecycle methods manually, the [`Actor`](../../reference/class/Actor) class provides a context manager, -which calls the [`Actor.init()`](../../reference/class/Actor#init) method on enter, -the [`Actor.exit()`](../../reference/class/Actor#exit) method on a clean exit, -and the [`Actor.fail()`](../../reference/class/Actor#fail) method when there is an exception during the run of the Actor. - -This is the recommended way to work with the `Actor` class. - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - Actor.log.info('Actor input:', await Actor.get_input()) - await Actor.set_value('OUTPUT', 'Hello, world!') - raise RuntimeError('Ouch!') -``` - -### Rebooting an Actor - -Sometimes, you want to restart your Actor to make it run from the beginning again. -To do that, you can use the [`Actor.reboot()`](../../reference/class/Actor#reboot) method. -When you call it, the Apify platform stops the container of the run, -and starts a new container of the same Actor with the same run ID and storages. - -Don't do it unconditionally, or you might get the Actor in a reboot loop. 
- -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - # TODO: figure out a good reason why to reboot - await Actor.reboot() -``` - -## Actor status message - -To inform you or the users running your Actors about the progress of their runs, -you can set the status message for the run, which will then be visible in the run detail in Apify Console, -or accessible through the Apify API. - -To set the status message for the Actor run, you can use the [`Actor.set_status_message()`](../../reference/class/Actor#set_status_message) method. - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - await Actor.set_status_message('Here we go!') - ... - await Actor.set_status_message('So far so good...') - ... - await Actor.set_status_message('Steady as she goes...') - ... - await Actor.set_status_message('Almost there...') - ... - await Actor.set_status_message('Phew! That was not that hard!') -``` diff --git a/docs/03-concepts/02-actor-input.mdx b/docs/03-concepts/02-actor-input.mdx deleted file mode 100644 index 7d9e89b3..00000000 --- a/docs/03-concepts/02-actor-input.mdx +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: Actor input -sidebar_label: Actor input ---- - -The Actor gets its [input](https://docs.apify.com/platform/actors/running/input) from the input record in its default key-value store. - -To access it, instead of reading the record manually, -you can use the [`Actor.get_input()`](../../reference/class/Actor#get_input) convenience method. -It will get the input record key from the Actor configuration, -read the record from the default key-value store, -and decrypt any [secret input fields](https://docs.apify.com/platform/actors/development/secret-input). - -For example, if an Actor received a JSON input with two fields, -`{ "firstNumber": 1, "secondNumber": 2 }`, -this is how you might process it: - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - actor_input = await Actor.get_input() or {} - first_number = actor_input.get('firstNumber') - second_number = actor_input.get('secondNumber') - Actor.log.info(f'Sum: {first_number + second_number}') -``` diff --git a/docs/03-concepts/03-storages.mdx b/docs/03-concepts/03-storages.mdx deleted file mode 100644 index 1b68f072..00000000 --- a/docs/03-concepts/03-storages.mdx +++ /dev/null @@ -1,284 +0,0 @@ ---- -title: Working with storages -sidebar_label: Working with storages ---- - -The `Actor` class provides methods to work either with the default storages of the Actor, or with any other storage, named or unnamed. - -## Types of storages - -There are three types of storages available to Actors. - -First are [datasets](https://docs.apify.com/platform/storage/dataset), which are append-only tables for storing the results of your Actors. -You can open a dataset through the [`Actor.open_dataset()`](../../reference/class/Actor#open_dataset) method, -and work with it through the resulting [`Dataset`](../../reference/class/Dataset) class instance. - -Next there are [key-value stores](https://docs.apify.com/platform/storage/key-value-store), -which function as a read/write storage for storing file-like objects, typically the Actor state or binary results. -You can open a key-value store through the [`Actor.open_key_value_store()`](../../reference/class/Actor#open_key_value_store) method, -and work with it through the resulting [`KeyValueStore`](../../reference/class/KeyValueStore) class instance. 
- -Finally, there are [request queues](https://docs.apify.com/platform/storage/request-queue). -These are queues into which you can put the URLs you want to scrape, -and from which the Actor can dequeue them and process them. -You can open a request queue through the [`Actor.open_request_queue()`](../../reference/class/Actor#open_request_queue) method, -and work with it through the resulting [`RequestQueue`](../../reference/class/RequestQueue) class instance. - -Each Actor run has its default dataset, default key-value store and default request queue. - -## Local storage emulation - -To be able to develop Actors locally, -the storages that the Apify platform provides are emulated on the local filesystem. - -The storage contents are loaded from and saved to the `storage` folder in the Actor's main folder. -Each storage type is stored in its own subfolder, so for example datasets are stored in the `storage/datasets` folder. - -Each storage is then stored in its own folder, named after the storage, or called `default` if it's the default storage. -For example, a request queue with the name `my-queue` would be stored in `storage/request_queues/my-queue`. - -Each dataset item, key-value store record, or request in a request queue is then stored in its own file in the storage folder. -Dataset items and request queue requests are always JSON files, and key-value store records can be any file type, based on its content type. -For example, the Actor input is typically stored in `storage/key_value_stores/default/INPUT.json`. - -### Local storage persistence - -By default, the storage contents are persisted across multiple Actor runs. -To clean up the Actor storages before the running the Actor, -use the `--purge` flag of the [`apify run`](https://docs.apify.com/cli/docs/reference#apify-run) command of the Apify CLI. - -```bash -apify run --purge -``` - -## Convenience methods for working with default storages - -There are several methods for directly working with the default key-value store or default dataset of the Actor. - -[`Actor.get_value('my-record')`](../../reference/class/Actor#get_value) reads a record from the default key-value store of the Actor. - -[`Actor.set_value('my-record', 'my-value')`](../../reference/class/Actor#set_value) saves a new value to the record in the default key-value store. - -[`Actor.get_input()`](../../reference/class/Actor#get_input) reads the Actor input from the default key-value store of the Actor. - -[`Actor.push_data([{'result': 'Hello, world!'}, ...])`](../../reference/class/Actor#push_data) saves results to the default dataset of the Actor. - -## Opening named and unnamed storages - -The [`Actor.open_dataset()`](../../reference/class/Actor#open_dataset), -[`Actor.open_key_value_store()`](../../reference/class/Actor#open_key_value_store) -and [`Actor.open_request_queue()`](../../reference/class/Actor#open_request_queue) methods -can be used to open any storage for reading and writing. -You can either use them without arguments to open the default storages, -or you can pass a storage ID or name to open another storage. 
- -```python title="src/main.py" -from apify import Actor, Request - -async def main(): - async with Actor: - # Work with the default dataset of the Actor - dataset = await Actor.open_dataset() - await dataset.push_data({'result': 'Hello, world!'}) - - # Work with the key-value store with ID 'mIJVZsRQrDQf4rUAf' - key_value_store = await Actor.open_key_value_store(id='mIJVZsRQrDQf4rUAf') - await key_value_store.set_value('record', 'Hello, world!') - - # Work with the request queue with the name 'my-queue' - request_queue = await Actor.open_request_queue(name='my-queue') - await request_queue.add_request(Request.from_url('https://example.com', unique_key='v0Nr'})) -``` - -## Deleting storages - -To delete a storage, you can use the -[`Dataset.drop()`](../../reference/class/Dataset#drop), -[`KeyValueStore.drop()`](../../reference/class/KeyValueStore#drop) -or [`RequestQueue.drop()`](../../reference/class/RequestQueue#drop) method. - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - # Open a key-value store with the name 'my-cool-store' - key_value_store = await Actor.open_key_value_store(name='my-cool-store') - await key_value_store.set_value('record', 'Hello, world!') - ... - - # Now we don't want it anymore - await key_value_store.drop() -``` - -## Working with datasets - -### Reading & writing items - -To write data into a dataset, you can use the [`Dataset.push_data()`](../../reference/class/Dataset#push_data) method. - -To read data from a dataset, you can use the [`Dataset.get_data()`](../../reference/class/Dataset#get_data) method. - -To get an iterator of the data, you can use the [`Dataset.iterate_items()`](../../reference/class/Dataset#iterate_items) method. - -```python -# Open a dataset and write some data in it -dataset = await Actor.open_dataset(name='my-cool-dataset') -await dataset.push_data([{'itemNo': i} for i in range(1000)]) - -# Read back the first half of the data -first_half = await dataset.get_data(limit=500) -print(first_half['items']) - -# Iterate over the second half -second_half = [] -async for item in dataset.iterate_items(offset=500): - second_half.append(item) -print(second_half) -``` - -### Exporting items - -You can also export the dataset items into a key-value store, as either a CSV or a JSON record, -using the [`Dataset.export_to_csv()`](../../reference/class/Dataset#export_to_csv) -or [`Dataset.export_to_json()`](../../reference/class/Dataset#export_to_json) method. - -```python -# Open a dataset and write some data in it -dataset = await Actor.open_dataset(name='my-cool-dataset') -await dataset.push_data([{'itemNo': i} for i in range(1000)]) - -# Export the data as CSV and JSON -await dataset.export_to_csv('data.csv', to_key_value_store_name='my-cool-key-value-store') -await dataset.export_to_json('data.json', to_key_value_store_name='my-cool-key-value-store') - -# Print the exported records -store = await Actor.open_key_value_store(name='my-cool-key-value-store') -print(await store.get_value('data.csv')) -print(await store.get_value('data.json')) -``` - -## Working with key-value stores - -### Reading and writing records - -To read records from a key-value store, you can use the [`KeyValueStore.get_value()`](../../reference/class/KeyValueStore#get_value) method. - -To write records into a key-value store, you can use the [`KeyValueStore.set_value()`](../../reference/class/KeyValueStore#set_value) method. -You can set the content type of a record with the `content_type` argument. 
-To delete a record, set its value to `None`. - -```python -# Open a key-value store and write some data in it -store = await Actor.open_key_value_store(name='my-cool-key-value-store') -await store.set_value('automatic_text', 'abcd') -await store.set_value('automatic_json', {'ab': 'cd'}) -await store.set_value('explicit_csv', 'a,b\nc,d', content_type='text/csv') - -# Try that the values are read correctly -print(await store.get_value('automatic_text')) -print(await store.get_value('automatic_json')) -print(await store.get_value('explicit_csv')) - -# Delete the `automatic_text` value -await store.set_value('automatic_text', None) -``` - -### Iterating keys - -To get an iterator of the key-value store record keys, -you can use the [`KeyValueStore.iterate_keys()`](../../reference/class/KeyValueStore#iterate_keys) method. - -```python -# Print the info for each record -print('Records in store:') -async for (key, info) in store.iterate_keys(): - print(f'{key=}, {info=}') -``` - -### Public URLs of records - -To get a publicly accessible URL of a key-value store record, -you can use the [`KeyValueStore.get_public_url()`](../../reference/class/KeyValueStore#get_public_url) method. - -```python -print(f'"my_record" record URL: {await store.get_public_url('my_record')}') -``` - -## Working with request queues - -### Adding requests to a queue - -To add a request into the queue, you can use the [`RequestQueue.add_request()`](../../reference/class/RequestQueue#add_request) method. - -You can use the `forefront` boolean argument to specify whether the request should go to the beginning of the queue, or to the end. - -You can use the `uniqueKey` of the request to uniquely identify a request. If you try to add more requests with the same unique key, -only the first one will be added. - -### Reading requests - -To fetch the next request from the queue for processing, -you can use the [`RequestQueue.fetch_next_request()`](../../reference/class/RequestQueue#fetch_next_request) method. - -To get info about a specific request from the queue, -you can use the [`RequestQueue.get_request()`](../../reference/class/RequestQueue#get_request) method. - -### Handling requests - -To mark a request as handled, you can use the [`RequestQueue.mark_request_as_handled()`](../../reference/class/RequestQueue#mark_request_as_handled) method. - -To mark a request as not handled, so that it gets retried, -you can use the [`RequestQueue.reclaim_request()`](../../reference/class/RequestQueue#reclaim_request) method. - -To check if all the requests in the queue are handled, -you can use the [`RequestQueue.is_finished()`](../../reference/class/RequestQueue#is_finished) method. 
- -### Full example - -```python title="src/main.py" -import asyncio -import random -from apify import Actor, Request - - -async def main(): - async with Actor: - # Open the queue - queue = await Actor.open_request_queue() - - # Add some requests to the queue - for i in range(1, 10): - await queue.add_request(Request.from_url(f'http://example.com/{i}', unique_key=f'{i}')) - - # Add a request to the start of the queue, for priority processing - await queue.add_request(Request.from_url(f'http://example.com/0', unique_key='0'), forefront=True) - - # If you try to add an existing request again, it will not do anything - operation_info = await queue.add_request(Request.from_url(f'http://different-example.com/5', unique_key='5')) - print(operation_info) - print(await queue.get_request(operation_info['requestId'])) - - # Finally, process the queue until all requests are handled - while not await queue.is_finished(): - # Fetch the next unhandled request in the queue - request = await queue.fetch_next_request() - # This can happen due to the eventual consistency of the underlying request queue storage, - # best solution is just to sleep a bit - if request is None: - await asyncio.sleep(1) - continue - - Actor.log.info(f'Processing request {request["uniqueKey"]}...') - Actor.log.info(f'Scraping URL {request["url"]}...') - - # Do some fake work, which fails 30% of the time - await asyncio.sleep(1) - if random.random() > 0.3: - # If processing the request was successful, mark it as handled - Actor.log.info('Request successful.') - await queue.mark_request_as_handled(request) - else: - # If processing the request was unsuccessful, reclaim it so it can be processed again - Actor.log.warning('Request failed, will retry!') - await queue.reclaim_request(request) diff --git a/docs/03-concepts/05-proxy-management.mdx b/docs/03-concepts/05-proxy-management.mdx deleted file mode 100644 index 7bc53f7e..00000000 --- a/docs/03-concepts/05-proxy-management.mdx +++ /dev/null @@ -1,194 +0,0 @@ ---- -id: proxy-management -title: Proxy management ---- - -[IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) -is one of the oldest and most effective ways of preventing access to a website. -It is therefore paramount for a good web scraping library -to provide easy to use but powerful tools which can work around IP blocking. -The most powerful weapon in your anti IP blocking arsenal -is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server). - -With the Apify SDK, you can use your own proxy servers, -proxy servers acquired from third-party providers, -or you can rely on [Apify Proxy](https://apify.com/proxy) for your scraping needs. - -## Quick start - -If you want to use Apify Proxy locally, -make sure that you run your Actors via the Apify CLI -and that you are [logged in](https://docs.apify.com/cli/docs/installation#login-with-your-apify-account) with your Apify account in the CLI. - -### Using Apify Proxy - -```python -proxy_configuration = await Actor.create_proxy_configuration() -proxy_url = await proxy_configuration.new_url() -``` - -### Using your own proxies - -```python -proxy_configuration = await Actor.create_proxy_configuration( - proxy_urls=[ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], -) -proxy_url = await proxy_configuration.new_url() -``` - -## Proxy Configuration - -All your proxy needs are managed by the [`ProxyConfiguration`](../../reference/class/ProxyConfiguration) class. 
-You create an instance using the [`Actor.create_proxy_configuration()`](../../reference/class/Actor#create_proxy_configuration) method. -Then you generate proxy URLs using the [`ProxyConfiguration.new_url()`](../../reference/class/ProxyConfiguration#new_url) method. - -### Apify Proxy vs. your own proxies - -The `ProxyConfiguration` class covers both Apify Proxy and custom proxy URLs, -so that you can easily switch between proxy providers. -However, some features of the class are available only to Apify Proxy users, -mainly because Apify Proxy is what one would call a super-proxy. -It's not a single proxy server, but an API endpoint that allows connection -through millions of different IP addresses. -So the class essentially has two modes: Apify Proxy or Your proxy. - -The difference is easy to remember. -Using the `proxy_url` or `new_url_function` arguments enables use of your custom proxy URLs, -whereas all the other options are there to configure Apify Proxy. -Visit the [Apify Proxy docs](https://docs.apify.com/proxy) for more info on how these parameters work. - -### IP Rotation and session management - -`proxyConfiguration.new_url()` allows you to pass a `session_id` parameter. -It will then be used to create a `session_id`-`proxy_url` pair, -and subsequent `new_url()` calls with the same `session_id` will always return the same `proxy_url`. -This is extremely useful in scraping, because you want to create the impression of a real user. - -When no `session_id` is provided, your custom proxy URLs are rotated round-robin, -whereas Apify Proxy manages their rotation using black magic to get the best performance. - -```python -proxy_configuration = await Actor.create_proxy_configuration( - proxy_urls=[ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], -) -proxy_url = await proxy_configuration.new_url() # http://proxy-1.com -proxy_url = await proxy_configuration.new_url() # http://proxy-2.com -proxy_url = await proxy_configuration.new_url() # http://proxy-1.com -proxy_url = await proxy_configuration.new_url() # http://proxy-2.com -proxy_url = await proxy_configuration.new_url(session_id='a') # http://proxy-1.com -proxy_url = await proxy_configuration.new_url(session_id='b') # http://proxy-2.com -proxy_url = await proxy_configuration.new_url(session_id='b') # http://proxy-2.com -proxy_url = await proxy_configuration.new_url(session_id='a') # http://proxy-1.com -``` - -### Apify Proxy Configuration - -With Apify Proxy, you can select specific proxy groups to use, or countries to connect from. -This allows you to get better proxy performance after some initial research. - -```python -proxy_configuration = await Actor.create_proxy_configuration( - groups=['RESIDENTIAL'], - country_code='US', -) -proxy_url = await proxy_configuration.new_url() -``` - -Now your connections using proxy_url will use only Residential proxies from the US. -Note that you must first get access to a proxy group before you are able to use it. -You can find your available proxy groups in the [proxy dashboard](https://console.apify.com/proxy). - -If you don't specify any proxy groups, automatic proxy selection will be used. - -### Your own proxy configuration - -There are two options how to make `ProxyConfiguration` work with your own proxies. 
- -Either you can pass it a list of your own proxy servers: - -```python -proxy_configuration = await Actor.create_proxy_configuration( - proxy_urls=[ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], -) -proxy_url = await proxy_configuration.new_url() -``` - -Or you can pass it a method (accepting one optional argument, the session ID), -to generate proxy URLs automatically: - -```python -def custom_new_url_function(session_id: Optional[str] = None) -> str: - if session_id is not None: - return f'http://my-custom-proxy-supporting-sessions.com?session-id={session_id} - return 'http://my-custom-proxy-not-supporting-sessions.com' - -proxy_configuration = await Actor.create_proxy_configuration( - new_url_function = custom_new_url_function, -) - -proxy_url_with_session = await proxy_configuration.new_url('a') -proxy_url_without_Session = await proxy_configuration.new_url() -``` - -### Configuring proxy based on Actor input - -To make selecting the proxies that the Actor uses easier, -you can use an input field with the editor [`proxy` in your input schema](https://docs.apify.com/platform/actors/development/input-schema#object). -This input will then be filled with a dictionary containing the proxy settings you or the users of your Actor selected for the Actor run. - -You can then use that input to create the proxy configuration: - -```python -actor_input = await Actor.get_input() or {} -proxy_settings = actor_input.get('proxySettings') -proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings) -proxy_url = await proxy_configuration.new_url() -``` - -## Using the generated proxy URLs - -### Requests - -To use the generated proxy URLs with the `requests` library, -use the [`proxies` argument](https://requests.readthedocs.io/en/latest/user/advanced/#proxies): - -```python -proxy_configuration = await Actor.create_proxy_configuration() -proxy_url = await proxy_configuration.new_url() -proxies = { - 'http': proxy_url, - 'https': proxy_url, -} - -response = requests.get('http://example.com', proxies=proxies) -# --- OR --- -with requests.Session() as session: - session.proxies.update(proxies) - response = session.get('http://example.com') -``` - -### HTTPX - -To use the generated proxy URLs with the `httpx` library, -use the [`proxies` argument](https://www.python-httpx.org/advanced/#http-proxying): - -```python -proxy_configuration = await Actor.create_proxy_configuration() -proxy_url = await proxy_configuration.new_url() - -response = httpx.get('http://example.com', proxy=proxy_url) -# --- OR --- -async with httpx.AsyncClient(proxy=proxy_url) as httpx_client: - response = await httpx_client.get('http://example.com') -``` - - diff --git a/docs/03-concepts/06-interacting-with-other-actors.mdx b/docs/03-concepts/06-interacting-with-other-actors.mdx deleted file mode 100644 index 2f70442f..00000000 --- a/docs/03-concepts/06-interacting-with-other-actors.mdx +++ /dev/null @@ -1,96 +0,0 @@ ---- -title: Interacting with other Actors -sidebar_label: Interacting with other Actors ---- - -There are several methods that interact with other Actors and Actor tasks on the Apify platform. - -## Actor.start() - -The [`Actor.start()`](../../reference/class/Actor#start) method starts another Actor on the Apify platform, -and immediately returns the details of the started Actor run. 
- -```python -# Start your own Actor named 'my-fancy-actor' -actor_run_details = await Actor.start('~my-fancy-actor', {'foo': 'bar'}) -print(f'Started run ID: {actor_run_details["id"]}') -``` - -## Actor.call() - -The [`Actor.call()`](../../reference/class/Actor#call) method starts another Actor on the Apify platform, -and waits for the started Actor run to finish. - -```python -# Start the `apify/screenshot-url`, wait for it to finish, and get its output -actor_run_details = await Actor.call( - 'apify/screenshot-url', - {'url': 'http://example.com', 'delay': 10000 }, -) -run_client = Actor.apify_client.run(actor_run_details['id']) -screenshot = await run_client.key_value_store().get_value('OUTPUT') -``` - -## Actor.call_task() - -The [`Actor.call_task()`](../../reference/class/Actor#call_task) method -starts an [Actor task](https://docs.apify.com/platform/actors/tasks) on the Apify platform, -and waits for the started Actor run to finish. - -```python -# Start the Actor task with ID `Z3m6FPSj0GYZ25rQc`, -# wait for it to finish, and get its dataset items -task_run_details = await Actor.call_task('Z3m6FPSj0GYZ25rQc') -run_client = Actor.apify_client.run(task_run_details['id']) -task_run_dataset_items = await run_client.dataset().list_items() -``` - -## Actor.metamorph() - -The [`Actor.metamorph()`](../../reference/class/Actor#metamorph) operation transforms an Actor run into a run of another Actor with a new input. -This feature is useful if you want to use another Actor to finish the work of your current Actor, -instead of internally starting a new Actor run and waiting for its finish. -With metamorph, you can easily create new Actors on top of existing ones, -and give your users nicer input structure and user interface for the final Actor. -For the users of your Actors, the metamorph operation is completely transparent; -they will just see your Actor got the work done. - -Internally, the system stops the container corresponding to the original Actor run -and starts a new container using a different container image. -All the default storages are preserved, -and the new Actor input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. - -To make you Actor compatible with the metamorph operation, -use [`Actor.get_input()`](../../reference/class/Actor#get_input) -instead of [`Actor.get_value('INPUT')`](../../reference/class/Actor#get_value) to read your Actor input. -This method will fetch the input using the right key in a case of metamorphed run. - -For example, imagine you have an Actor that accepts a hotel URL on input, -and then internally uses the [`apify/web-scraper`](https://apify.com/apify/web-scraper) public Actor to scrape all the hotel reviews. 
-The metamorphing code would look as follows: - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - # Get the original Actor input - actor_input = await Actor.get_input() or {} - hotel_url = actor_input.get('hotel_url') - - # Create new input for `apify/web-scraper` - web_scraper_input = { - 'startUrls': [{ url: hotelUrl }], - 'pageFunction': """async function pageFunction(context) { - // Here you pass the JavaScript page function - // that scrapes all the reviews from the hotel's URL - }""", - } - - # Metamorph the Actor run to `apify/web-scraper` with the new input - await Actor.metamorph('apify/web-scraper', web_scraper_input) - - # This code will not be called, - # since the `metamorph` action terminates the current Actor run container - print('You will not see this!') -``` diff --git a/docs/03-concepts/07-webhooks.mdx b/docs/03-concepts/07-webhooks.mdx deleted file mode 100644 index b8b6480a..00000000 --- a/docs/03-concepts/07-webhooks.mdx +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: Creating webhooks -sidebar_label: Creating webhooks ---- - -Webhooks allow you to configure the Apify platform to perform an action when a certain event occurs. -For example, you can use them to start another Actor when the current run finishes or fails. - -You can learn more in the [documentation for webhooks](https://docs.apify.com/platform/integrations/webhooks). - -## Creating an ad-hoc webhook dynamically - -Besides creating webhooks manually in Apify Console, or through the Apify API, -you can also create [ad-hoc webhooks](https://docs.apify.com/platform/integrations/webhooks/ad-hoc-webhooks) -dynamically from the code of your Actor using the [`Actor.add_webhook()`](../../reference/class/Actor#add_webhook) method: - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - await Actor.add_webhook( - event_types: ['ACTOR.RUN.FAILED'], - request_url: 'https://example.com/run-failed', - ) - raise RuntimeError('I am an error and I know it!') -``` - -Note that webhooks are only supported when running on the Apify platform. -When running the Actor locally, the method will print a warning and have no effect. - -### Preventing duplicate webhooks - -To ensure that duplicate ad-hoc webhooks won't get created in a case of Actor restart, -you can use the `idempotencyKey` parameter. -The idempotency key must be unique across all the webhooks of a user so that only one webhook gets created for a given value. -You can use, for example, the Actor run ID as the idempotency key: - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - await Actor.add_webhook( - event_types: ['ACTOR.RUN.FAILED'], - request_url: 'https://example.com/run-failed', - idempotency_key: Actor.config.actor_run_id, - ) - raise RuntimeError('I am an error and I know it!') -``` diff --git a/docs/03-concepts/08-access-apify-api.mdx b/docs/03-concepts/08-access-apify-api.mdx deleted file mode 100644 index 83ee6191..00000000 --- a/docs/03-concepts/08-access-apify-api.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Accessing the Apify API -sidebar_label: Accessing Apify API ---- - -The Apify SDK contains many useful features for making Actor development easier. -However, it does not cover all the features the Apify API offers. - -For working with the Apify API directly, -you can use the provided instance of the [Apify API Client](https://docs.apify.com/api/client/python) library. 
- -## Actor.apify_client - -To access the provided instance of [`ApifyClientAsync`](https://docs.apify.com/api/client/python/reference/class/ApifyClientAsync), -you can use the [`Actor.apify_client`](../../reference/class/Actor#apify_client) property. - -For example, to get the details of your user, you can use this snippet: - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - me = await Actor.apify_client.user('me').get() - print(me) -``` - -## Actor.new_client() - -If you want to create a completely new instance of the client, -for example, to get a client for a different user or change the configuration of the client, -you can use the [`Actor.new_client()`](../../reference/class/Actor#new_client) method: - -```python title="src/main.py" -from apify import Actor - -async def main(): - async with Actor: - another_users_client = Actor.new_client(token='ANOTHER_USERS_TOKEN', max_retries=2) - them = await another_users_client.user('me').get() - print(them) -``` diff --git a/docs/03-concepts/09-running-webserver.mdx b/docs/03-concepts/09-running-webserver.mdx deleted file mode 100644 index d1ad9d90..00000000 --- a/docs/03-concepts/09-running-webserver.mdx +++ /dev/null @@ -1,66 +0,0 @@ ---- -title: Running a webserver in your Actor -sidebar_label: Running a webserver ---- - -Each Actor run on the Apify platform is assigned a unique hard-to-guess URL (for example `https://8segt5i81sokzm.runs.apify.net`), -which enables HTTP access to an optional web server running inside the Actor run's container. - -The URL is available in the following places: - -- In Apify Console, on the Actor run details page as the **Container URL** field. -- In the API as the `containerUrl` property of the [Run object](https://docs.apify.com/api/v2#/reference/actors/run-object/get-run). -- In the Actor as the `Actor.config.container_url` property. - -The web server running inside the container must listen at the port defined by the `Actor.config.container_port` property. -When running Actors locally, the port defaults to `4321`, -so the web server will be accessible at `http://localhost:4321`. 
- -## Example - -The following example demonstrates how to start a simple web server in your Actor, -which will respond to every GET request with the number of items that the Actor has processed so far: - -```python title="src/main.py" -import asyncio -from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer - -from apify import Actor - -processed_items = 0 -http_server = None - -# Just a simple handler that will print the number of processed items so far -# on every GET request -class RequestHandler(BaseHTTPRequestHandler): - def do_GET(self): - self.log_request() - self.send_response(200) - self.end_headers() - self.wfile.write(bytes(f'Processed items: {processed_items}', encoding='utf-8')) - -def run_server(): - # Start the HTTP server on the provided port, - # and save a reference to the server - global http_server - with ThreadingHTTPServer(('', Actor.config.container_port), RequestHandler) as server: - Actor.log.info(f'Server running on {Actor.config.container_url}') - http_server = server - server.serve_forever() - -async def main(): - global processed_items - async with Actor: - # Start the HTTP server in a separate thread - run_server_task = asyncio.get_running_loop().run_in_executor(None, run_server) - - # Simulate doing some work - for _ in range(100): - await asyncio.sleep(1) - processed_items += 1 - Actor.log.info(f'Processed items: {processed_items}') - - # Signal the HTTP server to shut down, and wait for it to finish - http_server.shutdown() - await run_server_task -``` diff --git a/docs/03-concepts/11-configuration.mdx b/docs/03-concepts/11-configuration.mdx deleted file mode 100644 index 9a4ad329..00000000 --- a/docs/03-concepts/11-configuration.mdx +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: Actor configuration and environment variables -sidebar_label: Configuration & env vars ---- - -The [`Actor`](../../reference/class/Actor) class gets configured using the [`Configuration`](../../reference/class/Configuration) class, -which initializes itself based on the provided environment variables. - -If you're using the Apify SDK in your Actors on the Apify platform, or Actors running locally through the Apify CLI, -you don't need to configure the `Actor` class manually, -unless you have some specific requirements, everything will get configured automatically. - -If you need some special configuration, you can adjust it either through the `Configuration` class directly, -or by setting environment variables when running the Actor locally. - -To see the full list of configuration options, check the `Configuration` class -or the list of environment variables that the Actor understands. 
- -### Configuring from code - -This will cause the Actor to persist its state every 10 seconds: - -```python title="src/main.py" -from datetime import timedelta -from apify import Actor, Event, Configuration - -async def main(): - global_config = Configuration.get_global_configuration() - global_config.persist_state_interval = timedelta(seconds=10) - - async with Actor: - async def save_state(): - await Actor.set_value('STATE', 'Hello, world!') - - # The `save_state` handler will be called every 10 seconds now - Actor.on(Event.PERSIST_STATE, save_state) -``` - -### Configuring via environment variables - -This Actor run will not persist its local storages to the filesystem: - -```bash -APIFY_PERSIST_STORAGE=0 apify run -``` diff --git a/docs/03_concepts/01_actor_lifecycle.mdx b/docs/03_concepts/01_actor_lifecycle.mdx new file mode 100644 index 00000000..e718a019 --- /dev/null +++ b/docs/03_concepts/01_actor_lifecycle.mdx @@ -0,0 +1,55 @@ +--- +title: Actor lifecycle +sidebar_label: Actor lifecycle +--- + +import CodeBlock from '@theme/CodeBlock'; + +import InitExitExample from '!!raw-loader!./code/01_init_exit.py'; +import ContextManagerExample from '!!raw-loader!./code/01_context_manager.py'; +import RebootExample from '!!raw-loader!./code/01_reboot.py'; +import StatusMessageExample from '!!raw-loader!./code/01_status_message.py'; + +In this guide, we will show you how to manage the lifecycle of an Apify Actor. + +## Initialization and cleanup + +At the start of its runtime, the Actor needs to initialize itself, its event manager and its storages, and at the end of the runtime it needs to close these cleanly. The Apify SDK provides several options on how to manage this. + +The [`Actor.init`](../../reference/class/Actor#init) method initializes the Actor, the event manager which processes the Actor events from the platform event websocket, and the storage client used in the execution environment. It should be called before performing any other Actor operations. + +The [`Actor.exit`](../../reference/class/Actor#exit) method then exits the Actor cleanly, tearing down the event manager and the storage client. There is also the [`Actor.fail`](../../reference/class/Actor#fail) method, which exits the Actor while marking it as failed. + + + {InitExitExample} + + +### Context manager + +So that you don't have to call the lifecycle methods manually, the [`Actor`](../../reference/class/Actor) class provides a context manager, which calls the [`Actor.init`](../../reference/class/Actor#init) method on enter, the [`Actor.exit`](../../reference/class/Actor#exit) method on a clean exit, and the [`Actor.fail`](../../reference/class/Actor#fail) method when there is an exception during the run of the Actor. + +This is the recommended way to work with the `Actor` class. + + + {ContextManagerExample} + + +## Rebooting an Actor + +Sometimes, you want to restart your Actor to make it run from the beginning again. To do that, you can use the [`Actor.reboot`](../../reference/class/Actor#reboot) method. When you call it, the Apify platform stops the container of the run, and starts a new container of the same Actor with the same run ID and storages. + +Don't do it unconditionally, or you might get the Actor in a reboot loop. + + + {RebootExample} + + +## Actor status message + +To inform you or the users running your Actors about the progress of their runs, you can set the status message for the run, which will then be visible in the run detail in Apify Console, or accessible through the Apify API. 
+ +To set the status message for the Actor run, you can use the [`Actor.set_status_message`](../../reference/class/Actor#set_status_message) method. + + + {StatusMessageExample} + diff --git a/docs/03_concepts/02_actor_input.mdx b/docs/03_concepts/02_actor_input.mdx new file mode 100644 index 00000000..b3c8bcf8 --- /dev/null +++ b/docs/03_concepts/02_actor_input.mdx @@ -0,0 +1,18 @@ +--- +title: Actor input +sidebar_label: Actor input +--- + +import CodeBlock from '@theme/CodeBlock'; + +import InputExample from '!!raw-loader!./code/02_input.py'; + +The Actor gets its [input](https://docs.apify.com/platform/actors/running/input) from the input record in its default [key-value store](https://docs.apify.com/platform/storage/key-value-store). + +To access it, instead of reading the record manually, you can use the [`Actor.get_input`](../../reference/class/Actor#get_input) convenience method. It will get the input record key from the Actor configuration, read the record from the default key-value store,and decrypt any [secret input fields](https://docs.apify.com/platform/actors/development/secret-input). + +For example, if an Actor received a JSON input with two fields, `{ "firstNumber": 1, "secondNumber": 2 }`, this is how you might process it: + + + {InputExample} + diff --git a/docs/03_concepts/03_storages.mdx b/docs/03_concepts/03_storages.mdx new file mode 100644 index 00000000..51e0eb57 --- /dev/null +++ b/docs/03_concepts/03_storages.mdx @@ -0,0 +1,167 @@ +--- +title: Working with storages +sidebar_label: Working with storages +--- + +import CodeBlock from '@theme/CodeBlock'; + +import OpeningStoragesExample from '!!raw-loader!./code/03_opening_storages.py'; +import DeletingStoragesExample from '!!raw-loader!./code/03_deleting_storages.py'; +import DatasetReadWriteExample from '!!raw-loader!./code/03_dataset_read_write.py'; +import DatasetExportsExample from '!!raw-loader!./code/03_dataset_exports.py'; +import KvsReadWriteExample from '!!raw-loader!./code/03_kvs_read_write.py'; +import KvsIteratingExample from '!!raw-loader!./code/03_kvs_iterating.py'; +import KvsPublicRecordExample from '!!raw-loader!./code/03_kvs_public_url.py'; +import RqExample from '!!raw-loader!./code/03_rq.py'; + +The `Actor` class provides methods to work either with the default storages of the Actor, or with any other storage, named or unnamed. + +## Types of storages + +There are three types of storages available to Actors. + +First are [datasets](https://docs.apify.com/platform/storage/dataset), which are append-only tables for storing the results of your Actors. You can open a dataset through the [`Actor.open_dataset`](../../reference/class/Actor#open_dataset) method, and work with it through the resulting [`Dataset`](../../reference/class/Dataset) class instance. + +Next there are [key-value stores](https://docs.apify.com/platform/storage/key-value-store), which function as a read/write storage for storing file-like objects, typically the Actor state or binary results. You can open a key-value store through the [`Actor.open_key_value_store`](../../reference/class/Actor#open_key_value_store) method, and work with it through the resulting [`KeyValueStore`](../../reference/class/KeyValueStore) class instance. + +Finally, there are [request queues](https://docs.apify.com/platform/storage/request-queue). These are queues into which you can put the URLs you want to scrape, and from which the Actor can dequeue them and process them. 
You can open a request queue through the [`Actor.open_request_queue`](../../reference/class/Actor#open_request_queue) method, and work with it through the resulting [`RequestQueue`](../../reference/class/RequestQueue) class instance. + +Each Actor run has its default dataset, default key-value store and default request queue. + +## Local storage emulation + +To be able to develop Actors locally, the storages that the Apify platform provides are emulated on the local filesystem. + +The storage contents are loaded from and saved to the `storage` folder in the Actor's main folder. Each storage type is stored in its own subfolder, so for example datasets are stored in the `storage/datasets` folder. + +Each storage is then stored in its own folder, named after the storage, or called `default` if it's the default storage. For example, a request queue with the name `my-queue` would be stored in `storage/request_queues/my-queue`. + +Each dataset item, key-value store record, or request in a request queue is then stored in its own file in the storage folder. Dataset items and request queue requests are always JSON files, and key-value store records can be any file type, based on their content type. For example, the Actor input is typically stored in `storage/key_value_stores/default/INPUT.json`. + +### Local storage persistence + +By default, the storage contents are persisted across multiple Actor runs. To clean up the Actor storages before running the Actor, use the `--purge` flag of the [`apify run`](https://docs.apify.com/cli/docs/reference#apify-run) command of the Apify CLI. + +```bash +apify run --purge +``` + +## Convenience methods for working with default storages + +There are several methods for directly working with the default key-value store or default dataset of the Actor. + +- [`Actor.get_value('my-record')`](../../reference/class/Actor#get_value) reads a record from the default key-value store of the Actor. +- [`Actor.set_value('my-record', 'my-value')`](../../reference/class/Actor#set_value) saves a new value to the record in the default key-value store. +- [`Actor.get_input`](../../reference/class/Actor#get_input) reads the Actor input from the default key-value store of the Actor. +- [`Actor.push_data([{'result': 'Hello, world!'}, ...])`](../../reference/class/Actor#push_data) saves results to the default dataset of the Actor. + +## Opening named and unnamed storages + +The [`Actor.open_dataset`](../../reference/class/Actor#open_dataset), [`Actor.open_key_value_store`](../../reference/class/Actor#open_key_value_store) and [`Actor.open_request_queue`](../../reference/class/Actor#open_request_queue) methods can be used to open any storage for reading and writing. You can either use them without arguments to open the default storages, or you can pass a storage ID or name to open another storage. + + + {OpeningStoragesExample} + + +## Deleting storages + +To delete a storage, you can use the [`Dataset.drop`](../../reference/class/Dataset#drop), +[`KeyValueStore.drop`](../../reference/class/KeyValueStore#drop) or [`RequestQueue.drop`](../../reference/class/RequestQueue#drop) methods. + + + {DeletingStoragesExample} + + +## Working with datasets + +In this section we will show you how to work with [datasets](https://docs.apify.com/platform/storage/dataset). + +### Reading & writing items + +To write data into a dataset, you can use the [`Dataset.push_data`](../../reference/class/Dataset#push_data) method. 
+ +To read data from a dataset, you can use the [`Dataset.get_data`](../../reference/class/Dataset#get_data) method. + +To get an iterator of the data, you can use the [`Dataset.iterate_items`](../../reference/class/Dataset#iterate_items) method. + + + {DatasetReadWriteExample} + + +### Exporting items + +You can also export the dataset items into a key-value store, as either a CSV or a JSON record, +using the [`Dataset.export_to_csv`](../../reference/class/Dataset#export_to_csv) +or [`Dataset.export_to_json`](../../reference/class/Dataset#export_to_json) method. + + + {DatasetExportsExample} + + +## Working with key-value stores + +In this section we will show you how to work with [key-value stores](https://docs.apify.com/platform/storage/key-value-store). + +### Reading and writing records + +To read records from a key-value store, you can use the [`KeyValueStore.get_value`](../../reference/class/KeyValueStore#get_value) method. + +To write records into a key-value store, you can use the [`KeyValueStore.set_value`](../../reference/class/KeyValueStore#set_value) method. +You can set the content type of a record with the `content_type` argument. +To delete a record, set its value to `None`. + + + {KvsReadWriteExample} + + +### Iterating keys + +To get an iterator of the key-value store record keys, +you can use the [`KeyValueStore.iterate_keys`](../../reference/class/KeyValueStore#iterate_keys) method. + + + {KvsIteratingExample} + + +### Public URLs of records + +To get a publicly accessible URL of a key-value store record, +you can use the [`KeyValueStore.get_public_url`](../../reference/class/KeyValueStore#get_public_url) method. + + + {KvsPublicRecordExample} + + +## Working with request queues + +In this section we will show you how to work with [request queues](https://docs.apify.com/platform/storage/request-queue). + +### Adding requests to a queue + +To add a request into the queue, you can use the [`RequestQueue.add_request`](../../reference/class/RequestQueue#add_request) method. + +You can use the `forefront` boolean argument to specify whether the request should go to the beginning of the queue, or to the end. + +You can use the `unique_key` of the request to uniquely identify a request. If you try to add more requests with the same unique key, only the first one will be added. + +Check out the [`Request`](../../reference/class/Request) for more information on how to create requests and what properties they have. + +### Reading requests + +To fetch the next request from the queue for processing, you can use the [`RequestQueue.fetch_next_request`](../../reference/class/RequestQueue#fetch_next_request) method. + +To get info about a specific request from the queue, you can use the [`RequestQueue.get_request`](../../reference/class/RequestQueue#get_request) method. + +### Handling requests + +To mark a request as handled, you can use the [`RequestQueue.mark_request_as_handled`](../../reference/class/RequestQueue#mark_request_as_handled) method. + +To mark a request as not handled, so that it gets retried, you can use the [`RequestQueue.reclaim_request`](../../reference/class/RequestQueue#reclaim_request) method. + +To check if all the requests in the queue are handled, you can use the [`RequestQueue.is_finished`](../../reference/class/RequestQueue#is_finished) method. 
+ +### Full example + + + {RqExample} + diff --git a/docs/03-concepts/04-actor-events.mdx b/docs/03_concepts/04_actor_events.mdx similarity index 57% rename from docs/03-concepts/04-actor-events.mdx rename to docs/03_concepts/04_actor_events.mdx index 91ad0695..5df0858e 100644 --- a/docs/03-concepts/04-actor-events.mdx +++ b/docs/03_concepts/04_actor_events.mdx @@ -3,11 +3,14 @@ title: Handling Actor events & persisting state sidebar_label: Actor events & state persistence --- +import CodeBlock from '@theme/CodeBlock'; + +import ActorEventsExample from '!!raw-loader!./code/04_actor_events.py'; + During its runtime, the Actor receives Actor events sent by the Apify platform or generated by the Apify SDK itself. ## Event types - @@ -20,15 +23,15 @@ During its runtime, the Actor receives Actor events sent by the Apify platform o @@ -40,7 +43,7 @@ During its runtime, the Actor receives Actor events sent by the Apify platform o {' '}to another worker server soon.

You can use it to persist the state of the Actor so that once it is executed again on the new server, it doesn't have to start over from the beginning. - Once you have persisted the state of your Actor, you can call Actor.reboot() + Once you have persisted the state of your Actor, you can call Actor.reboot to reboot the Actor and trigger the migration immediately, to speed up the process. @@ -55,12 +58,12 @@ During its runtime, the Actor receives Actor events sent by the Apify platform o - + @@ -68,47 +71,11 @@ During its runtime, the Actor receives Actor events sent by the Apify platform o
SYSTEM_INFO
{`{
-  "createdAt": datetime,
-  "cpuCurrentUsage": float,
-  "memCurrentBytes": int,
-  "isCpuOverloaded": bool
+  "created_at": datetime,
+  "cpu_current_usage": float,
+  "mem_current_bytes": int,
+  "is_cpu_overloaded": bool
 }`}
             

This event is emitted regularly and it indicates the current resource usage of the Actor.

- The isCpuOverloaded argument indicates whether the current CPU usage is higher than Config.max_used_cpu_ratio + The is_cpu_overloaded argument indicates whether the current CPU usage is higher than Config.max_used_cpu_ratio
PERSIST_STATE
{`{ "isMigrating": bool }`}
{`{ "is_migrating": bool }`}

Emitted in regular intervals (by default 60 seconds) to notify the Actor that it should persist its state, in order to avoid repeating all work when the Actor restarts.

This event is also emitted automatically when the MIGRATING event happens, - in which case the isMigrating flag is set to True.

+ in which case the is_migrating flag is set to True.

Note that the PERSIST_STATE event is provided merely for user convenience, you can achieve the same effect by persisting the state regularly in an interval and listening for the migrating event.
- - ## Adding handlers to events -To add handlers to these events, you use the [`Actor.on()`](../../reference/class/Actor#on) method, -and to remove them, you use the [`Actor.off()`](../../reference/class/Actor#off) method. - -```python title="src/main.py" -import asyncio -from apify import Actor, Event - -async def main(): - async with Actor: - total_items = 1000 - - # Load the state if it's saved from some previous execution - processed_items = 0 - actor_state = await Actor.get_value('STATE') - if actor_state is not None: - processed_items = actor_state - - # Save the state when the `PERSIST_STATE` event happens - async def save_state(event_data): - nonlocal processed_items - Actor.log.info('Saving Actor state', extra=event_data) - await Actor.set_value('STATE', processed_items) - - Actor.on(Event.PERSIST_STATE, save_state) - - # Do some fake work - for i in range(processed_items, total_items): - Actor.log.info(f'Processing item {i}...') - processed_items = i - await asyncio.sleep(0.1) - - # Suppose we can stop saving the state now - Actor.off(Event.PERSIST_STATE, save_state) +To add handlers to these events, you use the [`Actor.on`](../../reference/class/Actor#on) method, +and to remove them, you use the [`Actor.off`](../../reference/class/Actor#off) method. - # Do some more fake work, this time something that can't be restarted, - # so no point persisting the state - for j in range(0, 10): - Actor.log.info(f'Processing item {j} of another kind...') - await asyncio.sleep(1) -``` + + {ActorEventsExample} + diff --git a/docs/03_concepts/05_proxy_management.mdx b/docs/03_concepts/05_proxy_management.mdx new file mode 100644 index 00000000..1f15cfae --- /dev/null +++ b/docs/03_concepts/05_proxy_management.mdx @@ -0,0 +1,108 @@ +--- +id: proxy-management +title: Proxy management +--- + +import CodeBlock from '@theme/CodeBlock'; + +import ApifyProxyExample from '!!raw-loader!./code/05_apify_proxy.py'; +import CustomProxyExample from '!!raw-loader!./code/05_custom_proxy.py'; +import ProxyRotationExample from '!!raw-loader!./code/05_proxy_rotation.py'; +import ApifyProxyConfig from '!!raw-loader!./code/05_apify_proxy_config.py'; +import CustomProxyFunctionExample from '!!raw-loader!./code/05_custom_proxy_function.py'; +import ProxyActorInputExample from '!!raw-loader!./code/05_proxy_actor_input.py'; +import ProxyHttpxExample from '!!raw-loader!./code/05_proxy_httpx.py'; + +[IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) is one of the oldest and most effective ways of preventing access to a website. It is therefore paramount for a good web scraping library to provide easy to use but powerful tools which can work around IP blocking. The most powerful weapon in your anti IP blocking arsenal is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server). + +With the Apify SDK, you can use your own proxy servers, proxy servers acquired from third-party providers, or you can rely on [Apify Proxy](https://apify.com/proxy) for your scraping needs. + +## Quick start + +If you want to use Apify Proxy locally, make sure that you run your Actors via the Apify CLI and that you are [logged in](https://docs.apify.com/cli/docs/installation#login-with-your-apify-account) with your Apify account in the CLI. + +### Using Apify proxy + + + {ApifyProxyExample} + + +### Using your own proxies + + + {CustomProxyExample} + + +## Proxy configuration + +All your proxy needs are managed by the [`ProxyConfiguration`](../../reference/class/ProxyConfiguration) class. 
You create an instance using the [`Actor.create_proxy_configuration()`](../../reference/class/Actor#create_proxy_configuration) method. Then you generate proxy URLs using the [`ProxyConfiguration.new_url()`](../../reference/class/ProxyConfiguration#new_url) method. + +### Apify proxy vs. your own proxies + +The `ProxyConfiguration` class covers both Apify Proxy and custom proxy URLs, so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Your proxy. + +The difference is easy to remember. Using the `proxy_url` or `new_url_function` arguments enables use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy. Visit the [Apify Proxy docs](https://docs.apify.com/proxy) for more info on how these parameters work. + +### IP rotation and session management + +`ProxyConfiguration.new_url` allows you to pass a `session_id` parameter. It will then be used to create a `session_id`-`proxy_url` pair, and subsequent `new_url()` calls with the same `session_id` will always return the same `proxy_url`. This is extremely useful in scraping, because you want to create the impression of a real user. + +When no `session_id` is provided, your custom proxy URLs are rotated round-robin, whereas Apify Proxy manages their rotation using black magic to get the best performance. + + + {ProxyRotationExample} + + +### Apify proxy configuration + +With Apify Proxy, you can select specific proxy groups to use, or countries to connect from. This allows you to get better proxy performance after some initial research. + + + {ApifyProxyConfig} + + +Now your connections using `proxy_url` will use only Residential proxies from the US. Note that you must first get access to a proxy group before you are able to use it. You can find your available proxy groups in the [proxy dashboard](https://console.apify.com/proxy). + +If you don't specify any proxy groups, automatic proxy selection will be used. + +### Your own proxy configuration + +There are two options for how to make `ProxyConfiguration` work with your own proxies. + +Either you can pass it a list of your own proxy servers: + + + {CustomProxyExample} + + +Or you can pass it a method (accepting one optional argument, the session ID), to generate proxy URLs automatically: + + + {CustomProxyFunctionExample} + + +### Configuring proxy based on Actor input + +To make selecting the proxies that the Actor uses easier, you can use an input field with the editor [`proxy` in your input schema](https://docs.apify.com/platform/actors/development/input-schema#object). This input will then be filled with a dictionary containing the proxy settings you or the users of your Actor selected for the Actor run. 
+ +You can then use that input to create the proxy configuration: + + + {ProxyActorInputExample} + + +## Using the generated proxy URLs + +### HTTPX + +To use the generated proxy URLs with the `httpx` library, use the [`proxies`](https://www.python-httpx.org/advanced/#http-proxying) argument: + + + {ProxyHttpxExample} + + +Make sure you have the `httpx` library installed: + +```bash +pip install httpx +``` diff --git a/docs/03_concepts/06_interacting_with_other_actors.mdx b/docs/03_concepts/06_interacting_with_other_actors.mdx new file mode 100644 index 00000000..ea1523d6 --- /dev/null +++ b/docs/03_concepts/06_interacting_with_other_actors.mdx @@ -0,0 +1,51 @@ +--- +title: Interacting with other Actors +sidebar_label: Interacting with other Actors +--- + +import CodeBlock from '@theme/CodeBlock'; + +import InteractingStartExample from '!!raw-loader!./code/06_interacting_start.py'; +import InteractingCallExample from '!!raw-loader!./code/06_interacting_call.py'; +import InteractingCallTaskExample from '!!raw-loader!./code/06_interacting_call_task.py'; +import InteractingMetamorphExample from '!!raw-loader!./code/06_interacting_metamorph.py'; + +There are several methods that interact with other Actors and Actor tasks on the Apify platform. + +## Actor start + +The [`Actor.start`](../../reference/class/Actor#start) method starts another Actor on the Apify platform, and immediately returns the details of the started Actor run. + + + {InteractingStartExample} + + +## Actor call + +The [`Actor.call`](../../reference/class/Actor#call) method starts another Actor on the Apify platform, and waits for the started Actor run to finish. + + + {InteractingCallExample} + + +## Actor call task + +The [`Actor.call_task`](../../reference/class/Actor#call_task) method starts an [Actor task](https://docs.apify.com/platform/actors/tasks) on the Apify platform, and waits for the started Actor run to finish. + + + {InteractingCallTaskExample} + + +## Actor metamorph + +The [`Actor.metamorph`](../../reference/class/Actor#metamorph) operation transforms an Actor run into a run of another Actor with a new input. This feature is useful if you want to use another Actor to finish the work of your current Actor, instead of internally starting a new Actor run and waiting for it to finish. With metamorph, you can easily create new Actors on top of existing ones, and give your users a nicer input structure and user interface for the final Actor. For the users of your Actors, the metamorph operation is completely transparent; they will just see that your Actor got the work done. + +Internally, the system stops the container corresponding to the original Actor run and starts a new container using a different container image. All the default storages are preserved, and the new Actor input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. + +To make your Actor compatible with the metamorph operation, use [`Actor.get_input`](../../reference/class/Actor#get_input) instead of [`Actor.get_value('INPUT')`](../../reference/class/Actor#get_value) to read your Actor input. This method will fetch the input using the right key in the case of a metamorphed run. + +For example, imagine you have an Actor that accepts a hotel URL on input, and then internally uses the [`apify/web-scraper`](https://apify.com/apify/web-scraper) public Actor to scrape all the hotel reviews. 
The metamorphing code would look as follows: + + + {InteractingMetamorphExample} + diff --git a/docs/03_concepts/07_webhooks.mdx b/docs/03_concepts/07_webhooks.mdx new file mode 100644 index 00000000..0d9b218c --- /dev/null +++ b/docs/03_concepts/07_webhooks.mdx @@ -0,0 +1,31 @@ +--- +title: Creating webhooks +sidebar_label: Creating webhooks +--- + +import CodeBlock from '@theme/CodeBlock'; + +import WebhookExample from '!!raw-loader!./code/07_webhook.py'; +import WebhookPreventingExample from '!!raw-loader!./code/07_webhook_preventing.py'; + +Webhooks allow you to configure the Apify platform to perform an action when a certain event occurs. For example, you can use them to start another Actor when the current run finishes or fails. + +You can learn more in the [documentation for webhooks](https://docs.apify.com/platform/integrations/webhooks). + +## Creating an ad-hoc webhook dynamically + +Besides creating webhooks manually in Apify Console, or through the Apify API,you can also create [ad-hoc webhooks](https://docs.apify.com/platform/integrations/webhooks/ad-hoc-webhooks) dynamically from the code of your Actor using the [`Actor.add_webhook`](../../reference/class/Actor#add_webhook) method: + + + {WebhookExample} + + +Note that webhooks are only supported when running on the Apify platform. When running the Actor locally, the method will print a warning and have no effect. + +## Preventing duplicate webhooks + +To ensure that duplicate ad-hoc webhooks won't get created in a case of Actor restart, you can use the `idempotency_key` parameter. The idempotency key must be unique across all the webhooks of a user so that only one webhook gets created for a given value. You can use, for example, the Actor run ID as the idempotency key: + + + {WebhookPreventingExample} + diff --git a/docs/03_concepts/08_access_apify_api.mdx b/docs/03_concepts/08_access_apify_api.mdx new file mode 100644 index 00000000..961907d8 --- /dev/null +++ b/docs/03_concepts/08_access_apify_api.mdx @@ -0,0 +1,31 @@ +--- +title: Accessing the Apify API +sidebar_label: Accessing Apify API +--- + +import CodeBlock from '@theme/CodeBlock'; + +import ActorClientExample from '!!raw-loader!./code/08_actor_client.py'; +import ActorNewClientExample from '!!raw-loader!./code/08_actor_new_client.py'; + +The Apify SDK contains many useful features for making Actor development easier. However, it does not cover all the features the Apify API offers. + +For working with the Apify API directly, you can use the provided instance of the [Apify API Client](https://docs.apify.com/api/client/python) library. + +## Actor client + +To access the provided instance of [`ApifyClientAsync`](https://docs.apify.com/api/client/python/reference/class/ApifyClientAsync), you can use the [`Actor.apify_client`](../../reference/class/Actor#apify_client) property. 
+ +For example, to get the details of your user, you can use this snippet: + + + {ActorClientExample} + + +## Actor new client + +If you want to create a completely new instance of the client, for example, to get a client for a different user or change the configuration of the client, you can use the [`Actor.new_client`](../../reference/class/Actor#new_client) method: + + + {ActorNewClientExample} + diff --git a/docs/03_concepts/09_running_webserver.mdx b/docs/03_concepts/09_running_webserver.mdx new file mode 100644 index 00000000..abee68c4 --- /dev/null +++ b/docs/03_concepts/09_running_webserver.mdx @@ -0,0 +1,26 @@ +--- +title: Running a webserver in your Actor +sidebar_label: Running a webserver +--- + +import CodeBlock from '@theme/CodeBlock'; + +import WebserverExample from '!!raw-loader!./code/09_webserver.py'; + +Each Actor run on the Apify platform is assigned a unique hard-to-guess URL (for example `https://8segt5i81sokzm.runs.apify.net`), which enables HTTP access to an optional web server running inside the Actor run's container. + +The URL is available in the following places: + +- In Apify Console, on the Actor run details page as the **Container URL** field. +- In the API as the `container_url` property of the [Run object](https://docs.apify.com/api/v2#/reference/actors/run-object/get-run). +- In the Actor as the `Actor.config.container_url` property. + +The web server running inside the container must listen at the port defined by the `Actor.config.container_port` property. When running Actors locally, the port defaults to `4321`, so the web server will be accessible at `http://localhost:4321`. + +## Example + +The following example demonstrates how to start a simple web server in your Actor, which will respond to every GET request with the number of items that the Actor has processed so far: + + + {WebserverExample} + diff --git a/docs/03-concepts/10-logging.mdx b/docs/03_concepts/10_logging.mdx similarity index 52% rename from docs/03-concepts/10-logging.mdx rename to docs/03_concepts/10_logging.mdx index 33c98b29..2e5d5c2a 100644 --- a/docs/03-concepts/10-logging.mdx +++ b/docs/03_concepts/10_logging.mdx @@ -3,71 +3,46 @@ title: Logging sidebar_label: Logging --- -The Apify SDK is logging useful information through the [`logging`](https://docs.python.org/3/library/logging.html) module -from Python's standard library, into the logger with the name `apify`. +import CodeBlock from '@theme/CodeBlock'; + +import LogConfigExample from '!!raw-loader!./code/10_log_config.py'; +import LoggerUsageExample from '!!raw-loader!./code/10_logger_usage.py'; + +The Apify SDK logs useful information through the [`logging`](https://docs.python.org/3/library/logging.html) module from Python's standard library, into the logger with the name `apify`. ## Automatic configuration -When you create an Actor from an Apify-provided template, either in Apify Console or through the Apify CLI, -you do not have to configure the logger yourself. -The template already contains initialization code for the logger, -which sets the logger level to `DEBUG` and the log formatter to [`ActorLogFormatter`](../../reference/class/ActorLogFormatter). +When you create an Actor from an Apify-provided template, either in Apify Console or through the Apify CLI, you do not have to configure the logger yourself. The template already contains initialization code for the logger, which sets the logger level to `DEBUG` and the log formatter to [`ActorLogFormatter`](../../reference/class/ActorLogFormatter). 
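+
+For reference, the initialization code included in those templates is roughly equivalent to the following snippet (a minimal sketch based on the manual configuration steps described below; the exact template code may differ):
+
+```python
+import logging
+
+from apify.log import ActorLogFormatter
+
+# Create a handler that prints the logs, formatted with the SDK's log formatter
+handler = logging.StreamHandler()
+handler.setFormatter(ActorLogFormatter())
+
+# The SDK logs into the 'apify' logger; show DEBUG and higher levels on it
+apify_logger = logging.getLogger('apify')
+apify_logger.setLevel(logging.DEBUG)
+apify_logger.addHandler(handler)
+```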
## Manual configuration ### Configuring the log level -In Python's default behavior, if you don't configure the logger otherwise, -only logs with level `WARNING` or higher are printed out to the standard output, without any formatting. -To also have logs with `DEBUG` and `INFO` level printed out, -you need to call the [`Logger.setLevel()`](https://docs.python.org/3/library/logging.html#logging.Logger.setLevel) method on the logger, -with the desired minimum level as an argument. +In Python's default behavior, if you don't configure the logger otherwise, only logs with level `WARNING` or higher are printed out to the standard output, without any formatting. To also have logs with `DEBUG` and `INFO` level printed out, you need to call the [`Logger.setLevel`](https://docs.python.org/3/library/logging.html#logging.Logger.setLevel) method on the logger, with the desired minimum level as an argument. ### Configuring the log formatting -By default, only the log message is printed out to the output, without any formatting. -To have a nicer output, with the log level printed in color, the messages nicely aligned, and extra log fields printed out, -you can use the [`ActorLogFormatter`](../../reference/class/ActorLogFormatter) class from the `apify.log` module. +By default, only the log message is printed out to the output, without any formatting. To have a nicer output, with the log level printed in color, the messages nicely aligned, and extra log fields printed out, you can use the [`ActorLogFormatter`](../../reference/class/ActorLogFormatter) class from the `apify.log` module. ### Example log configuration To configure and test the logger, you can use this snippet: -```python -import logging - -from apify import Actor -from apify.log import ActorLogFormatter + + {LogConfigExample} + -handler = logging.StreamHandler() -handler.setFormatter(ActorLogFormatter()) - -apify_logger = logging.getLogger('apify') -apify_logger.setLevel(logging.DEBUG) -apify_logger.addHandler(handler) -``` - -This configuration will cause all levels of messages to be printed to the standard output, -with some pretty formatting. +This configuration will cause all levels of messages to be printed to the standard output, with some pretty formatting. ## Logger usage Here you can see how all the log levels would look like. -You can use the `extra` argument for all log levels, it's not specific to the warning level. -When you use `Logger.exception()`, there is no need to pass the Exception object to the log manually, -it will automatiacally infer it from the current execution context and print the exception details. - -```python -Actor.log.debug('This is a debug message') -Actor.log.info('This is an info message') -Actor.log.warning('This is a warning message', extra={'reason': 'Bad Actor!'}) -Actor.log.error('This is an error message') -try: - raise RuntimeError('Ouch!') -except: - Actor.log.exception('This is an exceptional message') -``` +You can use the `extra` argument for all log levels; it's not specific to the warning level. When you use `Logger.exception`, there is no need to pass the Exception object to the log manually, it will automatically infer it from the current execution context and print the exception details. 
+ + + {LoggerUsageExample} + Result: diff --git a/docs/03_concepts/11_configuration.mdx b/docs/03_concepts/11_configuration.mdx new file mode 100644 index 00000000..2435421f --- /dev/null +++ b/docs/03_concepts/11_configuration.mdx @@ -0,0 +1,32 @@ +--- +title: Actor configuration and environment variables +sidebar_label: Configuration & env vars +--- + +import CodeBlock from '@theme/CodeBlock'; + +import ConfigExample from '!!raw-loader!./code/11_config.py'; + +The [`Actor`](../../reference/class/Actor) class gets configured using the [`Configuration`](../../reference/class/Configuration) class, which initializes itself based on the provided environment variables. + +If you're using the Apify SDK in your Actors on the Apify platform, or Actors running locally through the Apify CLI, you don't need to configure the `Actor` class manually,unless you have some specific requirements, everything will get configured automatically. + +If you need some special configuration, you can adjust it either through the `Configuration` class directly,or by setting environment variables when running the Actor locally. + +To see the full list of configuration options, check the `Configuration` class or the list of environment variables that the Actor understands. + +## Configuring from code + +This will cause the Actor to persist its state every 10 seconds: + + + {ConfigExample} + + +## Configuring via environment variables + +This Actor run will not persist its local storages to the filesystem: + +```bash +APIFY_PERSIST_STORAGE=0 apify run +``` diff --git a/docs/03_concepts/code/01_context_manager.py b/docs/03_concepts/code/01_context_manager.py new file mode 100644 index 00000000..8a3e8654 --- /dev/null +++ b/docs/03_concepts/code/01_context_manager.py @@ -0,0 +1,9 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() + Actor.log.info('Actor input: %s', actor_input) + await Actor.set_value('OUTPUT', 'Hello, world!') + raise RuntimeError('Ouch!') diff --git a/docs/03_concepts/code/01_init_exit.py b/docs/03_concepts/code/01_init_exit.py new file mode 100644 index 00000000..674c9285 --- /dev/null +++ b/docs/03_concepts/code/01_init_exit.py @@ -0,0 +1,16 @@ +from apify import Actor + + +async def main() -> None: + await Actor.init() + + try: + Actor.log.info('Actor input:', await Actor.get_input()) + await Actor.set_value('OUTPUT', 'Hello, world!') + raise RuntimeError('Ouch!') + + except Exception as exc: + Actor.log.exception('Error while running Actor') + await Actor.fail(exit_code=91, exception=exc) + + await Actor.exit() diff --git a/docs/03_concepts/code/01_reboot.py b/docs/03_concepts/code/01_reboot.py new file mode 100644 index 00000000..e398c5f4 --- /dev/null +++ b/docs/03_concepts/code/01_reboot.py @@ -0,0 +1,7 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # ... your code here ... + await Actor.reboot() diff --git a/docs/03_concepts/code/01_status_message.py b/docs/03_concepts/code/01_status_message.py new file mode 100644 index 00000000..13bf2b34 --- /dev/null +++ b/docs/03_concepts/code/01_status_message.py @@ -0,0 +1,14 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + await Actor.set_status_message('Here we go!') + # Do some work... + await Actor.set_status_message('So far so good...') + # Do some more work... + await Actor.set_status_message('Steady as she goes...') + # Do even more work... 
+ await Actor.set_status_message('Almost there...') + # Finish the job + await Actor.set_status_message('Phew! That was not that hard!') diff --git a/docs/03_concepts/code/02_input.py b/docs/03_concepts/code/02_input.py new file mode 100644 index 00000000..b3bd3034 --- /dev/null +++ b/docs/03_concepts/code/02_input.py @@ -0,0 +1,9 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() or {} + first_number = actor_input.get('firstNumber', 0) + second_number = actor_input.get('secondNumber', 0) + Actor.log.info('Sum: %s', first_number + second_number) diff --git a/docs/03_concepts/code/03_dataset_exports.py b/docs/03_concepts/code/03_dataset_exports.py new file mode 100644 index 00000000..78f0f5b9 --- /dev/null +++ b/docs/03_concepts/code/03_dataset_exports.py @@ -0,0 +1,31 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Open a dataset and write some data in it + dataset = await Actor.open_dataset(name='my-cool-dataset') + await dataset.push_data([{'itemNo': i} for i in range(1000)]) + + # Export the data as CSV + await dataset.export_to( + content_type='csv', + key='data.csv', + to_key_value_store_name='my-cool-key-value-store', + ) + + # Export the data as JSON + await dataset.export_to( + content_type='json', + key='data.json', + to_key_value_store_name='my-cool-key-value-store', + ) + + # Print the exported records + store = await Actor.open_key_value_store(name='my-cool-key-value-store') + + csv_data = await store.get_value('data.csv') + Actor.log.info(f'CSV data: {csv_data}') + + json_data = await store.get_value('data.json') + Actor.log.info(f'JSON data: {json_data}') diff --git a/docs/03_concepts/code/03_dataset_read_write.py b/docs/03_concepts/code/03_dataset_read_write.py new file mode 100644 index 00000000..6d8ac7f0 --- /dev/null +++ b/docs/03_concepts/code/03_dataset_read_write.py @@ -0,0 +1,16 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Open a dataset and write some data in it + dataset = await Actor.open_dataset(name='my-cool-dataset') + await dataset.push_data([{'itemNo': i} for i in range(1000)]) + + # Read back the first half of the data + first_half = await dataset.get_data(limit=500) + Actor.log.info(f'The first half of items = {first_half.items}') + + # Iterate over the second half + second_half = [item async for item in dataset.iterate_items(offset=500)] + Actor.log.info(f'The second half of items = {second_half}') diff --git a/docs/03_concepts/code/03_deleting_storages.py b/docs/03_concepts/code/03_deleting_storages.py new file mode 100644 index 00000000..68925bd9 --- /dev/null +++ b/docs/03_concepts/code/03_deleting_storages.py @@ -0,0 +1,13 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Open a key-value store with the name 'my-cool-store' + key_value_store = await Actor.open_key_value_store(name='my-cool-store') + await key_value_store.set_value('record', 'Hello, world!') + + # Do something ... 
+ + # Now we don't want it anymore + await key_value_store.drop() diff --git a/docs/03_concepts/code/03_kvs_iterating.py b/docs/03_concepts/code/03_kvs_iterating.py new file mode 100644 index 00000000..f5944095 --- /dev/null +++ b/docs/03_concepts/code/03_kvs_iterating.py @@ -0,0 +1,18 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Open a named key-value store + kvs = await Actor.open_key_value_store(name='my-cool-key-value-store') + + # Write some data to it + await kvs.set_value('automatic_text', 'abcd') + await kvs.set_value('automatic_json', {'ab': 'cd'}) + await kvs.set_value('explicit_csv', 'a,b\nc,d', content_type='text/csv') + + # Print the info for each record + Actor.log.info('Records in store:') + + async for key, info in kvs.iterate_keys(): + Actor.log.info(f'key={key}, info={info}') diff --git a/docs/03_concepts/code/03_kvs_public_url.py b/docs/03_concepts/code/03_kvs_public_url.py new file mode 100644 index 00000000..fe8ae07a --- /dev/null +++ b/docs/03_concepts/code/03_kvs_public_url.py @@ -0,0 +1,11 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Open a named key-value store + store = await Actor.open_key_value_store(name='my-cool-key-value-store') + + # Get the public URL of a record + my_record_url = await store.get_public_url('my_record') + Actor.log.info(f'URL of "my_record": {my_record_url}') diff --git a/docs/03_concepts/code/03_kvs_read_write.py b/docs/03_concepts/code/03_kvs_read_write.py new file mode 100644 index 00000000..239aa2e2 --- /dev/null +++ b/docs/03_concepts/code/03_kvs_read_write.py @@ -0,0 +1,25 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Open a named key-value store + kvs = await Actor.open_key_value_store(name='my-cool-key-value-store') + + # Write some data to it + await kvs.set_value('automatic_text', 'abcd') + await kvs.set_value('automatic_json', {'ab': 'cd'}) + await kvs.set_value('explicit_csv', 'a,b\nc,d', content_type='text/csv') + + # Get the values and log them + automatic_text = await kvs.get_value('automatic_text') + Actor.log.info(f'Automatic text: {automatic_text}') + + automatic_json = await kvs.get_value('automatic_json') + Actor.log.info(f'Automatic JSON: {automatic_json}') + + explicit_csv = await kvs.get_value('explicit_csv') + Actor.log.info(f'Explicit CSV: {explicit_csv}') + + # Delete the `automatic_text` value + await kvs.set_value('automatic_text', None) diff --git a/docs/03_concepts/code/03_opening_storages.py b/docs/03_concepts/code/03_opening_storages.py new file mode 100644 index 00000000..b4ccbd09 --- /dev/null +++ b/docs/03_concepts/code/03_opening_storages.py @@ -0,0 +1,16 @@ +from apify import Actor, Request + + +async def main() -> None: + async with Actor: + # Work with the default dataset of the Actor + dataset = await Actor.open_dataset() + await dataset.push_data({'result': 'Hello, world!'}) + + # Work with the key-value store with ID 'mIJVZsRQrDQf4rUAf' + key_value_store = await Actor.open_key_value_store(id='mIJVZsRQrDQf4rUAf') + await key_value_store.set_value('record', 'Hello, world!') + + # Work with the request queue with the name 'my-queue' + request_queue = await Actor.open_request_queue(name='my-queue') + await request_queue.add_request(Request.from_url('https://apify.com')) diff --git a/docs/03_concepts/code/03_rq.py b/docs/03_concepts/code/03_rq.py new file mode 100644 index 00000000..ba6a9570 --- /dev/null +++ b/docs/03_concepts/code/03_rq.py @@ -0,0 +1,50 @@ +import asyncio +import random + 
+from apify import Actor, Request + +FAILURE_RATE = 0.3 + + +async def main() -> None: + async with Actor: + # Open the queue + queue = await Actor.open_request_queue() + + # Add some requests to the queue + for i in range(1, 10): + await queue.add_request(Request.from_url(f'http://example.com/{i}')) + + # Add a request to the start of the queue, for priority processing + await queue.add_request(Request.from_url('http://example.com/0'), forefront=True) + + # If you try to add an existing request again, it will not do anything + add_request_info = await queue.add_request(Request.from_url('http://example.com/5')) + Actor.log.info(f'Add request info: {add_request_info}') + + processed_request = await queue.get_request(add_request_info.id) + Actor.log.info(f'Processed request: {processed_request}') + + # Finally, process the queue until all requests are handled + while not await queue.is_finished(): + # Fetch the next unhandled request in the queue + request = await queue.fetch_next_request() + # This can happen due to the eventual consistency of the underlying request queue storage; + # the best solution is just to sleep a bit + if request is None: + await asyncio.sleep(1) + continue + + Actor.log.info(f'Processing request {request.unique_key}...') + Actor.log.info(f'Scraping URL {request.url}...') + + # Do some fake work, which fails 30% of the time + await asyncio.sleep(1) + if random.random() > FAILURE_RATE: + # If processing the request was successful, mark it as handled + Actor.log.info('Request successful.') + await queue.mark_request_as_handled(request) + else: + # If processing the request was unsuccessful, reclaim it so it can be processed again + Actor.log.warning('Request failed, will retry!') + await queue.reclaim_request(request) diff --git a/docs/03_concepts/code/04_actor_events.py b/docs/03_concepts/code/04_actor_events.py new file mode 100644 index 00000000..1c8c785d --- /dev/null +++ b/docs/03_concepts/code/04_actor_events.py @@ -0,0 +1,38 @@ +import asyncio +from typing import Any + +from apify import Actor, Event + + +async def main() -> None: + async with Actor: + total_items = 1000 + + # Load the state if it's saved from some previous execution + processed_items = 0 + actor_state = await Actor.get_value('STATE') + if actor_state is not None: + processed_items = actor_state + + # Save the state when the `PERSIST_STATE` event happens + async def save_state(event_data: Any) -> None: + nonlocal processed_items + Actor.log.info('Saving Actor state', extra=event_data) + await Actor.set_value('STATE', processed_items) + + Actor.on(Event.PERSIST_STATE, save_state) + + # Do some fake work + for i in range(processed_items, total_items): + Actor.log.info(f'Processing item {i}...') + processed_items = i + await asyncio.sleep(0.1) + + # Suppose we can stop saving the state now + Actor.off(Event.PERSIST_STATE, save_state) + + # Do some more fake work, this time something that can't be restarted, + # so there's no point in persisting the state + for j in range(10): + Actor.log.info(f'Processing item {j} of another kind...') + await asyncio.sleep(1) diff --git a/docs/03_concepts/code/05_apify_proxy.py b/docs/03_concepts/code/05_apify_proxy.py new file mode 100644 index 00000000..96e7104c --- /dev/null +++ b/docs/03_concepts/code/05_apify_proxy.py @@ -0,0 +1,12 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration() + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') +
+ proxy_url = await proxy_configuration.new_url() + Actor.log.info(f'Using proxy URL: {proxy_url}') diff --git a/docs/03_concepts/code/05_apify_proxy_config.py b/docs/03_concepts/code/05_apify_proxy_config.py new file mode 100644 index 00000000..ba078b35 --- /dev/null +++ b/docs/03_concepts/code/05_apify_proxy_config.py @@ -0,0 +1,15 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration( + groups=['RESIDENTIAL'], + country_code='US', + ) + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') + + proxy_url = await proxy_configuration.new_url() + Actor.log.info(f'Proxy URL: {proxy_url}') diff --git a/docs/03_concepts/code/05_custom_proxy.py b/docs/03_concepts/code/05_custom_proxy.py new file mode 100644 index 00000000..d4c8a24a --- /dev/null +++ b/docs/03_concepts/code/05_custom_proxy.py @@ -0,0 +1,17 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration( + proxy_urls=[ + 'http://proxy-1.com', + 'http://proxy-2.com', + ], + ) + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') + + proxy_url = await proxy_configuration.new_url() + Actor.log.info(f'Using proxy URL: {proxy_url}') diff --git a/docs/03_concepts/code/05_custom_proxy_function.py b/docs/03_concepts/code/05_custom_proxy_function.py new file mode 100644 index 00000000..71aced2a --- /dev/null +++ b/docs/03_concepts/code/05_custom_proxy_function.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from apify import Actor, Request + + +async def custom_new_url_function( + session_id: str | None = None, + _: Request | None = None, +) -> str | None: + if session_id is not None: + return f'http://my-custom-proxy-supporting-sessions.com?session-id={session_id}' + return 'http://my-custom-proxy-not-supporting-sessions.com' + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration( + new_url_function=custom_new_url_function, # type: ignore[arg-type] + ) + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') + + proxy_url_with_session = await proxy_configuration.new_url('a') + Actor.log.info(f'Using proxy URL: {proxy_url_with_session}') + + proxy_url_without_session = await proxy_configuration.new_url() + Actor.log.info(f'Using proxy URL: {proxy_url_without_session}') diff --git a/docs/03_concepts/code/05_proxy_actor_input.py b/docs/03_concepts/code/05_proxy_actor_input.py new file mode 100644 index 00000000..3a69ea0a --- /dev/null +++ b/docs/03_concepts/code/05_proxy_actor_input.py @@ -0,0 +1,14 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() or {} + proxy_settings = actor_input.get('proxySettings') + proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings) + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') + + proxy_url = await proxy_configuration.new_url() + Actor.log.info(f'Using proxy URL: {proxy_url}') diff --git a/docs/03_concepts/code/05_proxy_httpx.py b/docs/03_concepts/code/05_proxy_httpx.py new file mode 100644 index 00000000..a124d1a5 --- /dev/null +++ b/docs/03_concepts/code/05_proxy_httpx.py @@ -0,0 +1,22 @@ +import httpx + +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await 
Actor.create_proxy_configuration( + proxy_urls=[ + 'http://proxy-1.com', + 'http://proxy-2.com', + ], + ) + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') + + proxy_url = await proxy_configuration.new_url() + + async with httpx.AsyncClient(proxy=proxy_url) as httpx_client: + response = await httpx_client.get('http://example.com') + Actor.log.info(f'Response: {response}') diff --git a/docs/03_concepts/code/05_proxy_rotation.py b/docs/03_concepts/code/05_proxy_rotation.py new file mode 100644 index 00000000..c816dabf --- /dev/null +++ b/docs/03_concepts/code/05_proxy_rotation.py @@ -0,0 +1,23 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + proxy_configuration = await Actor.create_proxy_configuration( + proxy_urls=[ + 'http://proxy-1.com', + 'http://proxy-2.com', + ], + ) + + if not proxy_configuration: + raise RuntimeError('No proxy configuration available.') + + proxy_url = await proxy_configuration.new_url() # http://proxy-1.com + proxy_url = await proxy_configuration.new_url() # http://proxy-2.com + proxy_url = await proxy_configuration.new_url() # http://proxy-1.com + proxy_url = await proxy_configuration.new_url() # http://proxy-2.com + proxy_url = await proxy_configuration.new_url(session_id='a') # http://proxy-1.com + proxy_url = await proxy_configuration.new_url(session_id='b') # http://proxy-2.com + proxy_url = await proxy_configuration.new_url(session_id='b') # http://proxy-2.com + proxy_url = await proxy_configuration.new_url(session_id='a') # http://proxy-1.com diff --git a/docs/03_concepts/code/06_interacting_call.py b/docs/03_concepts/code/06_interacting_call.py new file mode 100644 index 00000000..46a0a90a --- /dev/null +++ b/docs/03_concepts/code/06_interacting_call.py @@ -0,0 +1,22 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Start the apify/screenshot-url Actor. + actor_run = await Actor.call( + actor_id='apify/screenshot-url', + run_input={'url': 'http://example.com', 'delay': 10000}, + ) + + if actor_run is None: + raise RuntimeError('Actor failed to start.') + + # Wait for the Actor run to finish. + run_client = Actor.apify_client.run(actor_run.id) + await run_client.wait_for_finish() + + # Get the Actor output from the key-value store. + kvs_client = run_client.key_value_store() + output = await kvs_client.get_record('OUTPUT') + Actor.log.info(f'Actor output: {output}') diff --git a/docs/03_concepts/code/06_interacting_call_task.py b/docs/03_concepts/code/06_interacting_call_task.py new file mode 100644 index 00000000..75335d69 --- /dev/null +++ b/docs/03_concepts/code/06_interacting_call_task.py @@ -0,0 +1,19 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Start the Actor task by its ID. + actor_run = await Actor.call_task(task_id='Z3m6FPSj0GYZ25rQc') + + if actor_run is None: + raise RuntimeError('Actor task failed to start.') + + # Wait for the task run to finish.
+ run_client = Actor.apify_client.run(actor_run.id) + await run_client.wait_for_finish() + + # Get the task run dataset items + dataset_client = run_client.dataset() + items = await dataset_client.list_items() + Actor.log.info(f'Task run dataset items: {items}') diff --git a/docs/03_concepts/code/06_interacting_metamorph.py b/docs/03_concepts/code/06_interacting_metamorph.py new file mode 100644 index 00000000..53d48882 --- /dev/null +++ b/docs/03_concepts/code/06_interacting_metamorph.py @@ -0,0 +1,24 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Get the original Actor input. + actor_input = await Actor.get_input() or {} + hotel_url = actor_input.get('hotel_url') + + # Create new input for apify/web-scraper Actor. + web_scraper_input = { + 'startUrls': [{'url': hotel_url}], + 'pageFunction': """async function pageFunction(context) { + // Here you pass the JavaScript page function + // that scrapes all the reviews from the hotel's URL + }""", + } + + # Metamorph the Actor run to `apify/web-scraper` with the new input. + await Actor.metamorph('apify/web-scraper', web_scraper_input) + + # This code will not be called, since the `metamorph` action terminates + # the current Actor run container. + Actor.log.info('You will not see this!') diff --git a/docs/03_concepts/code/06_interacting_start.py b/docs/03_concepts/code/06_interacting_start.py new file mode 100644 index 00000000..075347c2 --- /dev/null +++ b/docs/03_concepts/code/06_interacting_start.py @@ -0,0 +1,13 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Start your own Actor named 'my-fancy-actor'. + actor_run = await Actor.start( + actor_id='~my-fancy-actor', + run_input={'foo': 'bar'}, + ) + + # Log the Actor run ID. + Actor.log.info(f'Actor run ID: {actor_run.id}') diff --git a/docs/03_concepts/code/07_webhook.py b/docs/03_concepts/code/07_webhook.py new file mode 100644 index 00000000..c2e382cf --- /dev/null +++ b/docs/03_concepts/code/07_webhook.py @@ -0,0 +1,16 @@ +from apify import Actor, Webhook + + +async def main() -> None: + async with Actor: + # Create a webhook that will be triggered when the Actor run fails. + webhook = Webhook( + event_types=['ACTOR.RUN.FAILED'], + request_url='https://example.com/run-failed', + ) + + # Add the webhook to the Actor. + await Actor.add_webhook(webhook) + + # Raise an error to simulate a failed run. + raise RuntimeError('I am an error and I know it!') diff --git a/docs/03_concepts/code/07_webhook_preventing.py b/docs/03_concepts/code/07_webhook_preventing.py new file mode 100644 index 00000000..988c531c --- /dev/null +++ b/docs/03_concepts/code/07_webhook_preventing.py @@ -0,0 +1,17 @@ +from apify import Actor, Webhook + + +async def main() -> None: + async with Actor: + # Create a webhook that will be triggered when the Actor run fails. + webhook = Webhook( + event_types=['ACTOR.RUN.FAILED'], + request_url='https://example.com/run-failed', + idempotency_key=Actor.config.actor_run_id, + ) + + # Add the webhook to the Actor. + await Actor.add_webhook(webhook) + + # Raise an error to simulate a failed run. + raise RuntimeError('I am an error and I know it!') diff --git a/docs/03_concepts/code/08_actor_client.py b/docs/03_concepts/code/08_actor_client.py new file mode 100644 index 00000000..68f5c2d7 --- /dev/null +++ b/docs/03_concepts/code/08_actor_client.py @@ -0,0 +1,11 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + # Create a new user client. 
+ user_client = Actor.apify_client.user('me') + + # Get information about the current user. + me = await user_client.get() + Actor.log.info(f'User: {me}') diff --git a/docs/03_concepts/code/08_actor_new_client.py b/docs/03_concepts/code/08_actor_new_client.py new file mode 100644 index 00000000..da59e6fc --- /dev/null +++ b/docs/03_concepts/code/08_actor_new_client.py @@ -0,0 +1,14 @@ +from apify import Actor + +TOKEN = 'ANOTHER_USERS_TOKEN' + + +async def main() -> None: + async with Actor: + # Create a new user client with a custom token. + apify_client = Actor.new_client(token=TOKEN, max_retries=2) + user_client = apify_client.user('me') + + # Get information about another user. + them = await user_client.get() + Actor.log.info(f'Another user: {them}') diff --git a/docs/03_concepts/code/09_webserver.py b/docs/03_concepts/code/09_webserver.py new file mode 100644 index 00000000..de6d953d --- /dev/null +++ b/docs/03_concepts/code/09_webserver.py @@ -0,0 +1,47 @@ +import asyncio +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +from apify import Actor + +processed_items = 0 +http_server = None + + +# Just a simple handler that will print the number of processed items so far +# on every GET request. +class RequestHandler(BaseHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + self.log_request() + self.send_response(200) + self.end_headers() + self.wfile.write(bytes(f'Processed items: {processed_items}', encoding='utf-8')) + + +def run_server() -> None: + # Start the HTTP server on the provided port, + # and save a reference to the server. + global http_server + with ThreadingHTTPServer(('', Actor.config.web_server_port), RequestHandler) as server: + Actor.log.info(f'Server running on {Actor.config.web_server_port}') + http_server = server + server.serve_forever() + + +async def main() -> None: + global processed_items + async with Actor: + # Start the HTTP server in a separate thread. + run_server_task = asyncio.get_running_loop().run_in_executor(None, run_server) + + # Simulate doing some work. + for _ in range(100): + await asyncio.sleep(1) + processed_items += 1 + Actor.log.info(f'Processed items: {processed_items}') + + if http_server is None: + raise RuntimeError('HTTP server not started') + + # Signal the HTTP server to shut down, and wait for it to finish.
+ http_server.shutdown() + await run_server_task diff --git a/docs/03_concepts/code/10_log_config.py b/docs/03_concepts/code/10_log_config.py new file mode 100644 index 00000000..520df753 --- /dev/null +++ b/docs/03_concepts/code/10_log_config.py @@ -0,0 +1,12 @@ +import logging + +from apify.log import ActorLogFormatter + + +async def main() -> None: + handler = logging.StreamHandler() + handler.setFormatter(ActorLogFormatter()) + + apify_logger = logging.getLogger('apify') + apify_logger.setLevel(logging.DEBUG) + apify_logger.addHandler(handler) diff --git a/docs/03_concepts/code/10_logger_usage.py b/docs/03_concepts/code/10_logger_usage.py new file mode 100644 index 00000000..a707ab5c --- /dev/null +++ b/docs/03_concepts/code/10_logger_usage.py @@ -0,0 +1,23 @@ +import logging + +from apify import Actor +from apify.log import ActorLogFormatter + + +async def main() -> None: + handler = logging.StreamHandler() + handler.setFormatter(ActorLogFormatter()) + + apify_logger = logging.getLogger('apify') + apify_logger.setLevel(logging.DEBUG) + apify_logger.addHandler(handler) + + async with Actor: + Actor.log.debug('This is a debug message') + Actor.log.info('This is an info message') + Actor.log.warning('This is a warning message', extra={'reason': 'Bad Actor!'}) + Actor.log.error('This is an error message') + try: + raise RuntimeError('Ouch!') + except RuntimeError: + Actor.log.exception('This is an exceptional message') diff --git a/docs/03_concepts/code/11_config.py b/docs/03_concepts/code/11_config.py new file mode 100644 index 00000000..10b07079 --- /dev/null +++ b/docs/03_concepts/code/11_config.py @@ -0,0 +1,16 @@ +from datetime import timedelta + +from apify import Actor, Configuration, Event + + +async def main() -> None: + global_config = Configuration.get_global_configuration() + global_config.persist_state_interval = timedelta(seconds=10) + + async with Actor: + # Define a handler that will be called for every persist state event. + async def save_state() -> None: + await Actor.set_value('STATE', 'Hello, world!') + + # The save_state handler will be called every 10 seconds now. 
+ Actor.on(Event.PERSIST_STATE, save_state) diff --git a/docs/04-upgrading/upgrading_to_v2.md b/docs/04_upgrading/upgrading_to_v2.md similarity index 100% rename from docs/04-upgrading/upgrading_to_v2.md rename to docs/04_upgrading/upgrading_to_v2.md diff --git a/pyproject.toml b/pyproject.toml index dc243de9..ee6bb867 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ scrapy = ["scrapy"] [tool.ruff] line-length = 120 +include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"] [tool.ruff.lint] select = ["ALL"] @@ -128,10 +129,12 @@ indent-style = "space" "TRY301", # Abstract `raise` to an inner function "TID252", # Prefer absolute imports over relative imports from parent modules ] -"**/{docs}/**" = [ - "D", # Everything from the pydocstyle - "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py - "F841", # Local variable {variable} is assigned to but never used +"**/{docs,website}/**" = [ + "D", # Everything from the pydocstyle + "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py + "F841", # Local variable {variable} is assigned to but never used + "TRY301", # Abstract `raise` to an inner function + "PLW0603", # Using the global statement to update `{name}` is discouraged ] [tool.ruff.lint.flake8-quotes] @@ -166,7 +169,7 @@ timeout = 1200 [tool.mypy] python_version = "3.9" plugins = ["pydantic.mypy"] -files = ["src", "tests"] +files = ["src", "tests", "docs", "website"] check_untyped_defs = true disallow_incomplete_defs = true disallow_untyped_calls = true @@ -180,13 +183,13 @@ warn_unused_ignores = true exclude = [] [[tool.mypy.overrides]] -module = ['scrapy', 'scrapy.*', 'lazy_object_proxy'] +module = ['scrapy', 'scrapy.*', 'lazy_object_proxy', 'nest_asyncio'] ignore_missing_imports = true [tool.basedpyright] pythonVersion = "3.9" typeCheckingMode = "standard" -include = ["src", "tests"] +include = ["src", "tests", "docs", "website"] [tool.coverage.report] exclude_lines = [ diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 9292cf54..87a4dfb3 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -127,7 +127,7 @@ def patch( # Try to get the return type of the client method using `typing.get_type_hints()` client_method_return_type = get_type_hints(client_method)['return'] except TypeError: - # There is a known issue with `typing.get_type_hints()` on Python 3.8 and 3.9. It raises a `TypeError` + # There is a known issue with `typing.get_type_hints()` on Python 3.9. It raises a `TypeError` # when `|` (Union) is used in the type hint, even with `from __future__ import annotations`. Since we # only need the return type, we attempt the following workaround. @@ -140,7 +140,7 @@ def patch( # 3. 
Try to get the return type again using `typing.get_type_hints()` client_method_return_type = get_type_hints(client_method_copied)['return'] - # TODO: Remove this fallback once we drop support for Python 3.8 and 3.9 + # TODO: Remove this fallback once we drop support for Python 3.9 # https://github.com/apify/apify-sdk-python/issues/151 original_submethod = getattr(client_method_return_type, submethod, None) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 52519c7e..c09cf866 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -117,6 +117,10 @@ module.exports = { url: 'https://crawlee.dev/python/api/class/RequestQueue', group: 'Classes', }, + { + url: 'https://crawlee.dev/python/api/class/Request', + group: 'Classes', + }, ], }, ], diff --git a/website/generate_module_shortcuts.py b/website/generate_module_shortcuts.py index f671ea9e..18516ef5 100755 --- a/website/generate_module_shortcuts.py +++ b/website/generate_module_shortcuts.py @@ -1,18 +1,26 @@ #!/usr/bin/env python3 +from __future__ import annotations + import importlib import inspect import json +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from types import ModuleType -def get_module_shortcuts(module, parent_classes=None): - """Traverse a module and its submodules, and if some class is present in both a module and its submodule, register a shortcut.""" +def get_module_shortcuts(module: ModuleType, parent_classes: list | None = None) -> dict: + """Traverse a module and its submodules to identify and register shortcuts for classes.""" shortcuts = {} if parent_classes is None: parent_classes = [] + parent_module_name = '.'.join(module.__name__.split('.')[:-1]) module_classes = [] + for classname, cls in inspect.getmembers(module, inspect.isclass): module_classes.append(cls) if cls in parent_classes: @@ -25,16 +33,15 @@ def get_module_shortcuts(module, parent_classes=None): return shortcuts -def resolve_shortcuts(shortcuts): +def resolve_shortcuts(shortcuts: dict) -> None: """Resolve linked shortcuts. - For example, if there are shortcuts A -> B and B -> C, - resolve them to A -> C. + For example, if there are shortcuts A -> B and B -> C, resolve them to A -> C. 
""" for source, target in shortcuts.items(): while target in shortcuts: shortcuts[source] = shortcuts[target] - target = shortcuts[target] + target = shortcuts[target] # noqa: PLW2901 shortcuts = {} @@ -43,7 +50,7 @@ def resolve_shortcuts(shortcuts): module = importlib.import_module(module_name) module_shortcuts = get_module_shortcuts(module) shortcuts.update(module_shortcuts) - except ModuleNotFoundError: + except ModuleNotFoundError: # noqa: PERF203 pass resolve_shortcuts(shortcuts) diff --git a/website/sidebars.js b/website/sidebars.js index 06c17b57..c25d52cf 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -7,7 +7,7 @@ module.exports = { items: [ { type: 'autogenerated', - dirName: '01-overview', + dirName: '01_overview', }, ], }, @@ -18,18 +18,18 @@ module.exports = { items: [ { type: 'autogenerated', - dirName: '02-guides', + dirName: '02_guides', }, ], }, { type: 'category', - label: 'Usage concepts', + label: 'Concepts', collapsed: false, items: [ { type: 'autogenerated', - dirName: '03-concepts', + dirName: '03_concepts', }, ], }, @@ -40,7 +40,7 @@ module.exports = { items: [ { type: 'autogenerated', - dirName: '04-upgrading', + dirName: '04_upgrading', }, ], }, diff --git a/website/src/pages/home_page_example.py b/website/src/pages/home_page_example.py new file mode 100644 index 00000000..c5441d61 --- /dev/null +++ b/website/src/pages/home_page_example.py @@ -0,0 +1,14 @@ +import httpx +from bs4 import BeautifulSoup + +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() + async with httpx.AsyncClient() as client: + response = await client.get(actor_input['url']) + soup = BeautifulSoup(response.content, 'html.parser') + data = {'url': actor_input['url'], 'title': soup.title.string if soup.title else None} + await Actor.push_data(data) diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 8877d47a..a8c46f35 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -7,6 +7,8 @@ import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import useBaseUrl from '@docusaurus/useBaseUrl'; import styles from './index.module.css'; +import HomePageExample from '!!raw-loader!./home_page_example.py'; + function Hero() { return (
@@ -26,8 +28,7 @@ function Hero() {

- The Apify SDK for Python is the official library for creating Apify Actors in Python. - It provides useful features like Actor lifecycle management, local storage emulation, and Actor event handling. + The Apify SDK for Python is the official library for creating Apify Actors in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor event handling.

@@ -66,23 +67,11 @@ export default function Home() {

- For example, the Apify SDK makes it easy to read the Actor input with the Actor.get_input() method, - and to save scraped data from your Actors to a dataset - {' '}by simply using the Actor.push_data() method. + For example, the Apify SDK makes it easy to read the Actor input with the Actor.get_input() method, and to save scraped data from your Actors to a dataset by simply using the Actor.push_data() method.

- {`from apify import Actor -from bs4 import BeautifulSoup -import requests - -async def main(): - async with Actor: - actor_input = await Actor.get_input() - response = requests.get(actor_input['url']) - soup = BeautifulSoup(response.content, 'html.parser') - await Actor.push_data({ 'url': actor_input['url'], 'title': soup.title.string })` - } + {HomePageExample}
From 910cc83e3380631853d00c9b877f719d13697b8a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 22 Jan 2025 19:55:15 +0100 Subject: [PATCH 2/3] Fix mypy config --- pyproject.toml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ee6bb867..881fd9c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,7 +183,14 @@ warn_unused_ignores = true exclude = [] [[tool.mypy.overrides]] -module = ['scrapy', 'scrapy.*', 'lazy_object_proxy', 'nest_asyncio'] +module = [ + 'bs4', + 'lazy_object_proxy', + 'nest_asyncio', + 'playwright.*', + 'scrapy.*', + 'selenium.*', +] ignore_missing_imports = true [tool.basedpyright] From b0437213916b03e8d791cbb472c316f436f1bf00 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 23 Jan 2025 10:28:31 +0100 Subject: [PATCH 3/3] makefile docs command update --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d77cbc9b..65db097e 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ build-api-reference: cd website && poetry run ./build_api_reference.sh build-docs: - cd website && npm clean-install && npm run build + cd website && poetry run npm clean-install && poetry run npm run build run-docs: build-api-reference - cd website && npm clean-install && npm run start + cd website && poetry run npm clean-install && poetry run npm run start