1 change: 1 addition & 0 deletions .github/workflows/run_code_checks.yaml
@@ -40,3 +40,4 @@ jobs:
docs_check:
name: Docs check
uses: apify/workflows/.github/workflows/python_docs_check.yaml@main
secrets: inherit
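
A note on the added line, since the job above calls a reusable workflow: secrets defined in the caller repository are not visible to a workflow referenced with uses: unless they are forwarded, and secrets: inherit forwards all of them at once. A minimal sketch of the explicit alternative, assuming the reusable workflow declares the secret under workflow_call (the secret name below is illustrative, not taken from this repository):

jobs:
  docs_check:
    name: Docs check
    uses: apify/workflows/.github/workflows/python_docs_check.yaml@main
    secrets:
      APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}  # hypothetical explicit pass-through; inherit avoids listing each secret
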
5 changes: 1 addition & 4 deletions Makefile
@@ -4,9 +4,6 @@
# This is default for local testing, but GitHub workflows override it to a higher value in CI
E2E_TESTS_CONCURRENCY = 1

# Placeholder token; replace with a real one for local docs testing if needed
APIFY_TOKEN = apify_api_token_placeholder

clean:
rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage

@@ -58,4 +55,4 @@ build-docs:
cd website && corepack enable && yarn && uv run yarn build

run-docs: build-api-reference
export APIFY_SIGNING_TOKEN=$(APIFY_TOKEN) && cd website && corepack enable && yarn && uv run yarn start
cd website && corepack enable && yarn && uv run yarn start
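
For local usage after this change, the two documentation targets shown above no longer need an Apify token exported from the Makefile (assuming nothing outside the lines shown in this diff still reads APIFY_TOKEN):

make build-docs   # cd website && corepack enable && yarn && uv run yarn build
make run-docs     # runs build-api-reference first, then serves the site with yarn start
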
23 changes: 14 additions & 9 deletions website/src/pages/home_page_example.py
@@ -1,37 +1,42 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
crawler = PlaywrightCrawler(
max_requests_per_crawl=10, # Limit the max requests per crawl.
headless=False, # Show the browser window.
browser_type='firefox', # Use the Firefox browser.
headless=True, # Run in headless mode (set to False to see the browser).
browser_type='firefox', # Use Firefox browser.
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Extract and enqueue all links found on the page.
await context.enqueue_links()

# Extract data from the page using Playwright API.
data = {
'url': context.request.url,
'title': await context.page.title(),
'content': (await context.page.content())[:100],
}

# Push the extracted data to the default dataset.
await context.push_data(data)

# Extract all links on the page and enqueue them.
await context.enqueue_links()

# Run the crawler with the initial list of URLs.
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a JSON file.
await crawler.export_data('results.json')
# Export the entire dataset to a CSV file.
await crawler.export_data('results.csv')

# Or work with the data directly.
# Or access the data directly.
data = await crawler.get_data()
crawler.log.info(f'Extracted data: {data.items}')


if __name__ == '__main__':
asyncio.run(main())
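
For completeness, a minimal sketch of consuming the exported file after a run, assuming the crawler above has already produced results.csv with the url, title and content keys pushed in the request handler:

import csv


def print_results(path: str = 'results.csv') -> None:
    # Read the CSV written by crawler.export_data() and print a one-line summary per row.
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            print(f"{row['url']}: {row['title']}")


if __name__ == '__main__':
    print_results()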