From fb15f5b58e3d3cc0e7c60d770975cd57d3ec132c Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Mon, 10 Nov 2025 18:42:29 +0100
Subject: [PATCH 1/2] docs: Update Playwright home page example

---
 website/src/pages/home_page_example.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/website/src/pages/home_page_example.py b/website/src/pages/home_page_example.py
index 60456028d1..16e36d3b5f 100644
--- a/website/src/pages/home_page_example.py
+++ b/website/src/pages/home_page_example.py
@@ -1,11 +1,13 @@
+import asyncio
+
 from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
 
 
 async def main() -> None:
     crawler = PlaywrightCrawler(
         max_requests_per_crawl=10,  # Limit the max requests per crawl.
-        headless=False,  # Show the browser window.
-        browser_type='firefox',  # Use the Firefox browser.
+        headless=True,  # Run in headless mode (set to False to see the browser).
+        browser_type='firefox',  # Use Firefox browser.
     )
 
     # Define the default request handler, which will be called for every request.
@@ -13,25 +15,28 @@ async def main() -> None:
     async def request_handler(context: PlaywrightCrawlingContext) -> None:
         context.log.info(f'Processing {context.request.url} ...')
 
-        # Extract and enqueue all links found on the page.
-        await context.enqueue_links()
-
         # Extract data from the page using Playwright API.
         data = {
             'url': context.request.url,
             'title': await context.page.title(),
-            'content': (await context.page.content())[:100],
         }
 
         # Push the extracted data to the default dataset.
         await context.push_data(data)
 
+        # Extract all links on the page and enqueue them.
+        await context.enqueue_links()
+
     # Run the crawler with the initial list of URLs.
     await crawler.run(['https://crawlee.dev'])
 
-    # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    # Export the entire dataset to a CSV file.
+    await crawler.export_data('results.csv')
 
-    # Or work with the data directly.
+    # Or access the data directly.
     data = await crawler.get_data()
     crawler.log.info(f'Extracted data: {data.items}')
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

From 047b2c5d4160a5ba8ac031ad503ac523bb8f9192 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Mon, 10 Nov 2025 19:35:58 +0100
Subject: [PATCH 2/2] minor updates regarding running docs code examples

---
 .github/workflows/run_code_checks.yaml | 1 +
 Makefile                               | 5 +----
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/run_code_checks.yaml b/.github/workflows/run_code_checks.yaml
index 9bd2924f2a..ca0493882f 100644
--- a/.github/workflows/run_code_checks.yaml
+++ b/.github/workflows/run_code_checks.yaml
@@ -40,3 +40,4 @@ jobs:
   docs_check:
     name: Docs check
     uses: apify/workflows/.github/workflows/python_docs_check.yaml@main
+    secrets: inherit
diff --git a/Makefile b/Makefile
index 98024112d2..6ca17970b0 100644
--- a/Makefile
+++ b/Makefile
@@ -4,9 +4,6 @@
 # This is default for local testing, but GitHub workflows override it to a higher value in CI
 E2E_TESTS_CONCURRENCY = 1
 
-# Placeholder token; replace with a real one for local docs testing if needed
-APIFY_TOKEN = apify_api_token_placeholder
-
 clean:
 	rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage
 
@@ -58,4 +55,4 @@ build-docs:
 	cd website && corepack enable && yarn && uv run yarn build
 
 run-docs: build-api-reference
-	export APIFY_SIGNING_TOKEN=$(APIFY_TOKEN) && cd website && corepack enable && yarn && uv run yarn start
+	cd website && corepack enable && yarn && uv run yarn start
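
As a quick illustration of consuming the output of the updated example in PATCH 1/2, here is a minimal sketch that reads back the exported file. It assumes the example has already been run so that results.csv exists in the working directory, and that each row carries the url and title fields pushed in the request handler; it is an illustration only, not part of the patch.

import csv
from pathlib import Path

# File written by `crawler.export_data('results.csv')` in the example above
# (assumed to land in the current working directory).
results_path = Path('results.csv')

with results_path.open(newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Each row should contain the fields pushed via `context.push_data`.
        print(row['url'], '->', row['title'])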