
Commit acdfd99

Merge commit: 2 parents 32e16a5 + 826adbf
Commit message: merge

File tree

20 files changed: +829 -604 lines


.github/workflows/run_code_checks.yaml

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ jobs:
       httpbin_url: ${{ secrets.APIFY_HTTPBIN_TOKEN && format('https://httpbin.apify.actor?token={0}', secrets.APIFY_HTTPBIN_TOKEN) || 'https://httpbin.org'}}
     with:
       python-versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'
+      os: '["ubuntu-latest", "windows-latest", "macos-latest"]'

   docs_check:
     name: Docs check

CHANGELOG.md

Lines changed: 11 additions & 2 deletions
@@ -3,7 +3,16 @@
 All notable changes to this project will be documented in this file.

 <!-- git-cliff-unreleased-start -->
-## 1.1.2 - **not yet released**
+## 1.2.1 - **not yet released**
+
+### 🐛 Bug Fixes
+
+- Fix short error summary ([#1605](https://github.com/apify/crawlee-python/pull/1605)) ([b751208](https://github.com/apify/crawlee-python/commit/b751208d9a56e9d923e4559baeba35e2eede0450)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1602](https://github.com/apify/crawlee-python/issues/1602)
+- Freeze core `Request` fields ([#1603](https://github.com/apify/crawlee-python/pull/1603)) ([ae6d86b](https://github.com/apify/crawlee-python/commit/ae6d86b8c82900116032596201d94cd7875aaadc)) by [@Mantisus](https://github.com/Mantisus)
+
+
+<!-- git-cliff-unreleased-end -->
+## [1.2.0](https://github.com/apify/crawlee-python/releases/tag/v1.2.0) (2025-12-08)

 ### 🚀 Features

@@ -14,9 +23,9 @@ All notable changes to this project will be documented in this file.

 - Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar)
 - Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus)
+- Align `Request.state` transitions with `Request` lifecycle ([#1601](https://github.com/apify/crawlee-python/pull/1601)) ([383225f](https://github.com/apify/crawlee-python/commit/383225f9f055d95ffb1302b8cf96f42ec264f1fc)) by [@Mantisus](https://github.com/Mantisus)


-<!-- git-cliff-unreleased-end -->
 ## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)

 ### 🐛 Bug Fixes

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "crawlee"
-version = "1.1.2"
+version = "1.2.1"
 description = "Crawlee for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "[email protected]" }]
 license = { file = "LICENSE" }

src/crawlee/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 from importlib import metadata

-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
     'RequestTransformAction',
     'SkippedReason',
     'service_locator',

src/crawlee/_request.py

Lines changed: 29 additions & 10 deletions
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""

-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""

     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""

-    state: RequestState | None = None
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""

     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]


 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):

     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.

@@ -178,17 +180,18 @@
     and specify which URLs shall be considered equal.
     """

-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""

-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
     """HTTP request method."""

     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""

@@ -250,6 +253,8 @@ def from_url(
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ def from_url(
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@
         if always_enqueue:
             unique_key = f'{crypto_random_object_id()}|{unique_key}'

+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
+
         request = cls(
             url=url,
             unique_key=unique_key,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )

@@ -352,7 +375,7 @@ def crawl_depth(self, new_value: int) -> None:
         self.crawlee_data.crawl_depth = new_value

     @property
-    def state(self) -> RequestState | None:
+    def state(self) -> RequestState:
         """Crawlee-specific request handling state."""
         return self.crawlee_data.state

@@ -365,10 +388,6 @@ def max_retries(self) -> int | None:
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries

-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""

src/crawlee/_types.py

Lines changed: 1 addition & 23 deletions
@@ -266,7 +266,6 @@ def __init__(
         *,
         key_value_store_getter: GetKeyValueStoreFunction,
         request: Request,
-        session: Session | None = None,
     ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
@@ -275,16 +274,11 @@

         # Isolated copies for handler execution
         self._request = deepcopy(request)
-        self._session = deepcopy(session) if session else None

     @property
     def request(self) -> Request:
         return self._request

-    @property
-    def session(self) -> Session | None:
-        return self._session
-
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -337,27 +331,11 @@
     def apply_request_changes(self, target: Request) -> None:
         """Apply tracked changes from handler copy to original request."""
         if self.request.user_data != target.user_data:
-            target.user_data.update(self.request.user_data)
+            target.user_data = self.request.user_data

         if self.request.headers != target.headers:
             target.headers = target.headers | self.request.headers

-    def apply_session_changes(self, target: Session | None = None) -> None:
-        """Apply tracked changes from handler copy to original session."""
-        simple_fields: set[str] = {'_usage_count', '_error_score'}
-
-        if self.session and target:
-            if self.session.user_data != target.user_data:
-                target.user_data.update(self.session.user_data)
-
-            if self.session.cookies != target.cookies:
-                target.cookies.set_cookies(self.session.cookies.get_cookies_as_dicts())
-            for field in simple_fields:
-                value = getattr(self.session, field)
-                original_value = getattr(target, field)
-                if value != original_value:
-                    object.__setattr__(target, field, value)
-

 @docs_group('Functions')
 class AddRequestsFunction(Protocol):
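
The switch from `update()` to plain assignment in `apply_request_changes` changes how deletions made in the handler's isolated copy propagate back, and the session is no longer copied at all: handlers now work with the original `Session` directly (see the `_basic_crawler.py` and `_adaptive_playwright_crawler.py` diffs below), so `apply_session_changes` becomes unnecessary. A standalone illustration of the `user_data` difference, using plain dicts rather than the crawlee API:

    original = {'a': 1, 'b': 2}
    handler_copy = {'a': 1}  # the handler removed key 'b' from its isolated copy

    merged = dict(original)
    merged.update(handler_copy)  # old behaviour: {'a': 1, 'b': 2} - the deletion is silently lost

    replaced = handler_copy      # new behaviour: {'a': 1} - the handler copy replaces the original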

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,7 @@
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar

-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
 from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
@@ -257,6 +257,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
             timeout=remaining_timeout,
         )

+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

     async def _handle_status_code_response(

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 4 additions & 3 deletions
@@ -291,12 +291,13 @@ async def get_input_state(

         # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
         result = RequestHandlerRunResult(
-            key_value_store_getter=self.get_key_value_store, request=context.request, session=context.session
+            key_value_store_getter=self.get_key_value_store,
+            request=context.request,
         )
         context_linked_to_result = BasicCrawlingContext(
             request=result.request,
-            session=result.session,
-            proxy_info=deepcopy(context.proxy_info),
+            session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=result.add_requests,
             push_data=result.push_data,

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 13 additions & 14 deletions
@@ -59,6 +59,7 @@
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -1135,7 +1136,7 @@ async def _handle_request_retries(
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1153,6 +1154,7 @@

             await request_manager.reclaim_request(request)
         else:
+            request.state = RequestState.ERROR
             await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
@@ -1168,8 +1170,6 @@ async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingC
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1202,8 +1202,8 @@
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)

         url = request.url if isinstance(request, Request) else request

@@ -1223,10 +1223,11 @@ def _get_message_from_error(self, error: Exception) -> str:

         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])

         return ''.join(used_traceback_parts).strip('\n')

@@ -1324,7 +1325,6 @@ async def _commit_request_handler_result(

         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

-        result.apply_session_changes(target=context.session)
         result.apply_request_changes(target=context.request)

     @staticmethod
@@ -1392,13 +1392,11 @@ async def __run_task_function(self) -> None:
         else:
             session = await self._get_session()
             proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(
-            key_value_store_getter=self.get_key_value_store, request=request, session=session
-        )
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

         context = BasicCrawlingContext(
             request=result.request,
-            session=result.session,
+            session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
             add_requests=result.add_requests,
@@ -1415,18 +1413,18 @@ async def __run_task_function(self) -> None:
         request.state = RequestState.REQUEST_HANDLER

         try:
-            with swaped_context(context, request, session):
+            with swaped_context(context, request):
                 self._check_request_collision(request, session)
                 await self._run_request_handler(context=context)
         except asyncio.TimeoutError as e:
             raise RequestHandlerError(e, context) from e

         await self._commit_request_handler_result(context)

-        await self._mark_request_as_handled(request)
-
         request.state = RequestState.DONE

+        await self._mark_request_as_handled(request)
+
         if session and session.is_usable:
             session.mark_good()

@@ -1493,6 +1491,7 @@
             raise

     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
+        context.request.state = RequestState.BEFORE_NAV
         await self._context_pipeline(
             context,
             lambda final_context: wait_for(