Commit ae6d86b

fix: Freeze core Request fields (#1603)

1 parent c6cb0e6 · commit ae6d86b

### Description

Ensures that core `Request` fields such as `unique_key`, `method`, and others cannot be changed after the request is created.
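The freeze is implemented with pydantic's `Field(frozen=True)` (see the diff below). As a minimal sketch of the resulting behavior, assuming pydantic v2 and crawlee's public `Request.from_url` API, reassigning a frozen field now raises a `ValidationError` instead of silently mutating the request:

```python
# Minimal sketch, not part of the commit. Assumes pydantic v2 and
# crawlee's public `Request` API.
import pydantic

from crawlee import Request

request = Request.from_url('https://crawlee.dev')

try:
    request.method = 'POST'  # `method` is now declared with Field(frozen=True)
except pydantic.ValidationError as exc:
    print(exc)  # pydantic reports a 'frozen_field' error for `method`
```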


src/crawlee/_request.py

Lines changed: 27 additions & 8 deletions
```diff
@@ -34,7 +34,7 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
 
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
 
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 
 
 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
 
@@ -178,17 +180,18 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
 
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
     """HTTP request method."""
 
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
 
@@ -250,6 +253,8 @@ def from_url(
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ def from_url(
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@ def from_url(
         if always_enqueue:
             unique_key = f'{crypto_random_object_id()}|{unique_key}'
 
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
+
         request = cls(
             url=url,
             unique_key=unique_key,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
 
@@ -365,10 +388,6 @@ def max_retries(self) -> int | None:
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries
 
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""
```
