
Commit 4801686

update to current master branch

2 parents fa14b58 + b9e7cb3

File tree

216 files changed: +8756 −4120 lines


.github/workflows/pre_release.yaml

Lines changed: 6 additions & 3 deletions

```diff
@@ -8,13 +8,15 @@ on:
     tags-ignore:
       - "**" # Ignore all tags to prevent duplicate builds when tags are pushed.
 
-concurrency:
+  # Or it can be triggered manually.
+  workflow_dispatch:
+
+concurrency:
   group: release
   cancel-in-progress: false
 
 jobs:
   release_metadata:
-    if: "!startsWith(github.event.head_commit.message, 'docs') && !startsWith(github.event.head_commit.message, 'ci') && startsWith(github.repository, 'apify/')"
     name: Prepare release metadata
     runs-on: ubuntu-latest
     outputs:
@@ -60,6 +62,7 @@ jobs:
     secrets: inherit
 
   publish_to_pypi:
+    if: "!startsWith(github.event.head_commit.message, 'ci') && !startsWith(github.event.head_commit.message, 'docs')"
     name: Publish to PyPI
     needs: [release_metadata, update_changelog]
     runs-on: ubuntu-latest
@@ -72,7 +75,7 @@ jobs:
     steps:
       - name: Prepare distribution
         uses: apify/workflows/prepare-pypi-distribution@main
-       with:
+        with:
           package_name: crawlee
           is_prerelease: "yes"
           version_number: ${{ needs.release_metadata.outputs.version_number }}
```

.github/workflows/release.yaml

Lines changed: 13 additions & 1 deletion

```diff
@@ -43,14 +43,26 @@ jobs:
       custom_version: ${{ inputs.custom_version }}
       existing_changelog_path: CHANGELOG.md
 
-  # If github.ref points to a [ci skip] commit, we assume that it was added by the pre_release workflow,
+  # If github.ref points to a [skip ci] commit, we assume that it was added by the pre_release workflow,
   # which doesn't push the commit if code checks don't pass.
   # Otherwise, the checks will have been triggered by the `run_code_checks` workflow.
   wait_for_checks:
     name: Wait for code checks to pass
     runs-on: ubuntu-latest
     steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Check if the head commit contains [skip ci]
+        id: check_skip
+        run: |
+          if git log --format=%B -n 1 ${{ github.sha }} | head -n 1 | grep '\[skip ci\]$'; then
+            echo 'skipped=true' >> $GITHUB_OUTPUT
+          else
+            echo 'skipped=false' >> $GITHUB_OUTPUT
+          fi
+
       - uses: lewagon/[email protected]
+        if: ${{ steps.check_skip.outputs.skipped == 'false' }}
         with:
           ref: ${{ github.ref }}
           repo-token: ${{ secrets.GITHUB_TOKEN }}
```
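The `check_skip` step added above shells out to `git log` to read the head commit's subject line. A minimal local sketch of the same detection logic in Python (the `head_commit_skips_ci` helper and the `HEAD` default are illustrative, not part of the workflow):

```python
import subprocess


def head_commit_skips_ci(sha: str = 'HEAD') -> bool:
    """Mirror the workflow step: does the commit subject end with '[skip ci]'?"""
    # Same command the workflow runs: git log --format=%B -n 1 <sha>
    message = subprocess.run(
        ['git', 'log', '--format=%B', '-n', '1', sha],
        capture_output=True,
        text=True,
        check=True,
    ).stdout
    first_line = message.splitlines()[0] if message.splitlines() else ''
    # The grep pattern '\[skip ci\]$' anchors the marker to the end of the line.
    return first_line.endswith('[skip ci]')


# The workflow writes the equivalent of this to $GITHUB_OUTPUT for the next step.
print('skipped=true' if head_commit_skips_ci() else 'skipped=false')
```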

.github/workflows/run_code_checks.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -29,7 +29,7 @@ jobs:
     name: Unit tests
     uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main
     secrets:
-      httpbin_url: ${{ secrets.APIFY_HTTPBIN_TOKEN && format('https://janbuchar--httpbin.apify.actor?token={0}', secrets.APIFY_HTTPBIN_TOKEN) || 'https://httpbin.org'}}
+      httpbin_url: ${{ secrets.APIFY_HTTPBIN_TOKEN && format('https://httpbin.apify.actor?token={0}', secrets.APIFY_HTTPBIN_TOKEN) || 'https://httpbin.org'}}
 
   docs_check:
     name: Docs check
```
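The `secrets.APIFY_HTTPBIN_TOKEN && format(...) || '...'` expression is the GitHub Actions idiom for a conditional with a fallback. A rough Python equivalent of the selection logic (reading the token from an environment variable is an assumption for illustration):

```python
import os

# If the token is set, use the tokenized Apify-hosted httpbin instance;
# otherwise fall back to the public one — the same shape as `A && f(A) || B`.
token = os.environ.get('APIFY_HTTPBIN_TOKEN')
httpbin_url = (
    f'https://httpbin.apify.actor?token={token}' if token else 'https://httpbin.org'
)
print(httpbin_url)
```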

CHANGELOG.md

Lines changed: 69 additions & 2 deletions

```diff
@@ -3,7 +3,75 @@
 All notable changes to this project will be documented in this file.
 
 <!-- git-cliff-unreleased-start -->
-## 0.5.0 - **not yet released**
+## 0.6.0 - **not yet released**
+
+### 🚀 Features
+
+- Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549)
+- Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel)
+- Implement `_snapshot_client` for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60)
+
+### 🐛 Bug Fixes
+
+- Fix playwright template and dockerfile ([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969)
+- Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975)
+
+### Refactor
+
+- [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek)
+- [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) ([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek)
+
+
+<!-- git-cliff-unreleased-end -->
+## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05)
+
+### 🚀 Features
+
+- Add support `use_incognito_pages` for `browser_launch_options` in `PlaywrightCrawler` ([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus)
+
+### 🐛 Bug Fixes
+
+- Fix session managment with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus)
+- Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951)
+- Fix convert relative link to absolute in `enqueue_links` for response with redirect ([#956](https://github.com/apify/crawlee-python/pull/956)) ([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by [@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955)
+- Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus)
+
+
+## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31)
+
+### 🚀 Features
+
+- Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) ([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891)
+- Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) ([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848)
+- Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank)
+- Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894)
+- Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) ([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda)
+- Add LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel)
+
+### 🐛 Bug Fixes
+
+- Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by [@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907)
+- Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by [@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895)
+- Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus)
+
+
+## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17)
+
+### 🐛 Bug Fixes
+
+- Avoid `use_state` race conditions. Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856)
+- Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887)
+- Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670)
+
+
+## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07)
+
+### 🐛 Bug Fixes
+
+- Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar)
+
+
+## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02)
 
 ### 🚀 Features
 
@@ -37,7 +105,6 @@ All notable changes to this project will be documented in this file.
 - [**breaking**] Update the crawlers & storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06)
 
 ### 🚀 Features
```
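Among the 0.5.3 entries above, the `keep_alive` flag changes when a crawl ends: the crawler keeps waiting for further requests instead of finishing once the queue drains. A minimal sketch (the crawler class and start URL are illustrative; note that `run()` will not return on its own in this mode):

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # keep_alive=True: the crawler stays up after the request queue empties,
    # so new requests can keep being added while it runs.
    crawler = BeautifulSoupCrawler(keep_alive=True)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```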

Makefile

Lines changed: 1 addition & 2 deletions

```diff
@@ -1,7 +1,5 @@
 .PHONY: clean install-dev build publish-to-pypi lint type-check unit-tests unit-tests-cov integration-tests format check-code build-api-reference run-docs
 
-DIRS_WITH_CODE = src tests docs
-
 # This is default for local testing, but GitHub workflows override it to a higher value in CI
 INTEGRATION_TESTS_CONCURRENCY = 1
 
@@ -12,6 +10,7 @@ install-dev:
 	uv sync --group dev --all-extras
 	uv run pre-commit install
 	uv run playwright install
+	uv run python -m browserforge update
 
 build:
 	uv build -v
```

README.md

Lines changed: 6 additions & 4 deletions

````diff
@@ -38,10 +38,12 @@ We also have a TypeScript implementation of the Crawlee, which you can explore a
 
 We recommend visiting the [Introduction tutorial](https://crawlee.dev/python/docs/introduction) in Crawlee documentation for more information.
 
-Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) PyPI package. The core functionality is included in the base package, with additional features available as optional extras to minimize package size and dependencies. To install Crawlee with all features, run the following command:
+Crawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.
+
+To install Crawlee with all features, run the following command:
 
 ```sh
-pip install 'crawlee[all]'
+python -m pip install 'crawlee[all]'
 ```
 
 Then, install the [Playwright](https://playwright.dev/) dependencies:
@@ -89,7 +91,7 @@ The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupC
 ```python
 import asyncio
 
-from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 
 async def main() -> None:
@@ -129,7 +131,7 @@ The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler
 ```python
 import asyncio
 
-from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
 
 
 async def main() -> None:
````
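Both README snippets now import from the consolidated `crawlee.crawlers` package introduced by the 0.5 restructuring. A minimal runnable example under the new import path (the handler body, request limit, and start URL are illustrative):

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # The parsed document is available as `context.soup`.
        title = context.soup.title.string if context.soup.title else None
        await context.push_data({'url': context.request.url, 'title': title})
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```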

docs/deployment/apify_platform.mdx

Lines changed: 14 additions & 4 deletions

````diff
@@ -6,8 +6,6 @@ description: Apify platform - large-scale and high-performance web scraping
 
 import ApiLink from '@site/src/components/ApiLink';
 
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
 import CodeBlock from '@theme/CodeBlock';
 
 import LogWithConfigExample from '!!raw-loader!./code/apify/log_with_config_example.py';
@@ -25,9 +23,21 @@ We do not test Crawlee in other cloud environments such as Lambda or on specific
 
 :::
 
+## Requirements
+
+To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up).
+
+Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation).
+
+Finally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. You can install it using `pip`:
+
+```bash
+pip install apify
+```
+
 ## Logging into Apify platform from Crawlee
 
-To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://github.com/apify/apify-cli) or with environment variables.
+To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables.
 
 Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on.
 
@@ -142,7 +152,7 @@ If you don't plan to force usage of the platform storages when running the Actor
 {/*
 ### Getting public url of an item in the platform storage
 
-If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify Platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.
+If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.
 
 <CodeBlock language="python">
   {GetPublicUrlSource}
````
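The new Requirements section installs the Apify SDK alongside Crawlee. A sketch of how the two typically combine when running on the platform (the `Actor` context manager is the SDK's entry point; the crawler body is illustrative):

```python
import asyncio

from apify import Actor

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Entering the Actor context wires Crawlee to platform storages and logging.
    async with Actor:
        crawler = BeautifulSoupCrawler()

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            await context.push_data({'url': context.request.url})

        await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```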
Lines changed: 61 additions & 0 deletions

New file (@@ -0,0 +1,61 @@):

```python
import asyncio

from playwright.async_api import Route

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
    )

    @crawler.router.handler(label='label')
    async def request_handler_for_label(
        context: AdaptivePlaywrightCrawlingContext,
    ) -> None:
        # Do some processing using `page`
        some_locator = context.page.locator('div').first
        await some_locator.wait_for()
        # Do stuff with locator...
        context.log.info(f'Playwright processing of: {context.request.url} ...')

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'User handler processing: {context.request.url} ...')
        # Do some processing using `parsed_content`
        context.log.info(context.parsed_content.title)

        # Find more links and enqueue them.
        await context.enqueue_links()
        await context.push_data({'Top crawler Url': context.request.url})

    @crawler.pre_navigation_hook
    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed both in static sub crawler and playwright sub crawler."""
        # Trying to access context.page in this hook would raise `AdaptiveContextError`
        # for pages crawled without playwright.
        context.log.info(f'pre navigation hook for: {context.request.url} ...')

    @crawler.pre_navigation_hook(playwright_only=True)
    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed only in playwright sub crawler."""

        async def some_routing_function(route: Route) -> None:
            await route.continue_()

        await context.page.route('*/**', some_routing_function)
        context.log.info(
            f'Playwright only pre navigation hook for: {context.request.url} ...'
        )

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())
```

docs/examples/code/beautifulsoup_crawler.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,7 +1,11 @@
 import asyncio
 from datetime import timedelta
 
-from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import (
+    BasicCrawlingContext,
+    BeautifulSoupCrawler,
+    BeautifulSoupCrawlingContext,
+)
 
 
 async def main() -> None:
```
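The widened import list hints at how the example uses these names: `timedelta` for a handler timeout and `BasicCrawlingContext` to type a failure callback. A hedged sketch of that pattern (the timeout value and handler bodies are assumptions, not the example's actual code):

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    # Bound how long a single request handler may run.
    crawler = BeautifulSoupCrawler(request_handler_timeout=timedelta(seconds=30))

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        # Called once retries are exhausted; typed with the broader context.
        context.log.error(f'Failed: {context.request.url} ({error})')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```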
