Skip to content

Commit 079f890

Browse files
committed
Merge remote-tracking branch 'origin/master' into add-deduplication
2 parents 65b297a + 5d01ce4 commit 079f890

17 files changed

+686
-249
lines changed

.github/workflows/build_and_deploy_docs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222

2323
steps:
2424
- name: Checkout repository
25-
uses: actions/checkout@v4
25+
uses: actions/checkout@v5
2626
with:
2727
token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
2828

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ All notable changes to this project will be documented in this file.
88
### 🐛 Bug Fixes
99

1010
- Restrict apify-shared and apify-client versions ([#523](https://github.com/apify/apify-sdk-python/pull/523)) ([b3ae5a9](https://github.com/apify/apify-sdk-python/commit/b3ae5a972a65454a4998eda59c9fcc3f6b7e8579)) by [@vdusek](https://github.com/vdusek)
11+
- Expose `APIFY_USER_IS_PAYING` env var to the configuration ([#507](https://github.com/apify/apify-sdk-python/pull/507)) ([0801e54](https://github.com/apify/apify-sdk-python/commit/0801e54887317c1280cc6828ecd3f2cc53287e76)) by [@stepskop](https://github.com/stepskop)
12+
13+
### Refactor
14+
15+
- [**breaking**] Adapt to the Crawlee v1.0 ([#470](https://github.com/apify/apify-sdk-python/pull/470)) ([f7e3320](https://github.com/apify/apify-sdk-python/commit/f7e33206cf3e4767faacbdc43511b45b6785f929)) by [@vdusek](https://github.com/vdusek), closes [#469](https://github.com/apify/apify-sdk-python/issues/469), [#540](https://github.com/apify/apify-sdk-python/issues/540)
1116

1217

1318
<!-- git-cliff-unreleased-end -->
@@ -35,6 +40,13 @@ All notable changes to this project will be documented in this file.
3540
- Tagline overlap ([#501](https://github.com/apify/apify-sdk-python/pull/501)) ([bae8340](https://github.com/apify/apify-sdk-python/commit/bae8340c46fea756ea35ea4d591da84c09d478e2)) by [@katzino](https://github.com/katzino)
3641

3742

43+
## [2.7.3](https://github.com/apify/apify-sdk-python/releases/tag/v2.7.3) (2025-08-11)
44+
45+
### 🐛 Bug Fixes
46+
47+
- Expose `APIFY_USER_IS_PAYING` env var to the configuration (#507) ([0de022c](https://github.com/apify/apify-sdk-python/commit/0de022c3435f24c821053c771e7b659433e3fb6e))
48+
49+
3850
## [2.7.2](https://github.com/apify/apify-sdk-python/releases/tag/v2.7.2) (2025-07-30)
3951

4052
### 🐛 Bug Fixes

pyproject.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,20 +66,22 @@ dev = [
6666
"build~=1.3.0",
6767
"crawlee[parsel]",
6868
"dycw-pytest-only>=2.1.1",
69-
"griffe~=1.9.0",
69+
"griffe~=1.11.0",
7070
"mypy~=1.17.0",
71-
"pre-commit~=4.2.0",
71+
"pre-commit~=4.3.0",
7272
"pydoc-markdown~=4.8.0",
7373
"pytest-asyncio~=1.1.0",
7474
"pytest-cov~=6.2.0",
75+
"pytest-httpserver>=1.1.3",
7576
"pytest-timeout>=2.4.0",
7677
"pytest-xdist~=3.8.0",
7778
"pytest~=8.4.0",
78-
"respx~=0.22.0",
7979
"ruff~=0.12.0",
8080
"setuptools", # setuptools are used by pytest but not explicitly required
8181
"types-cachetools>=6.0.0.20250525",
8282
"uvicorn[standard]",
83+
"werkzeug~=3.1.3", # Werkzeug is used by httpserver
84+
"yarl~=1.20.0", # yarl is used by crawlee
8385
]
8486

8587
[tool.hatch.build.targets.wheel]

src/apify/_configuration.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,15 @@ class Configuration(CrawleeConfiguration):
367367
),
368368
] = None
369369

370+
user_is_paying: Annotated[
371+
bool,
372+
Field(
373+
alias='apify_user_is_paying',
374+
description='True if the user calling the Actor is paying user',
375+
),
376+
BeforeValidator(lambda val: False if val == '' else val),
377+
] = False
378+
370379
web_server_port: Annotated[
371380
int,
372381
Field(

src/apify/storage_clients/_apify/_models.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,3 @@ class CachedRequest(BaseModel):
105105

106106
lock_expires_at: datetime | None = None
107107
"""The expiration time of the lock on the request."""
108-
109-
forefront: bool = False
110-
"""Whether the request was added to the forefront of the queue."""

src/apify/storage_clients/_apify/_request_queue_client.py

Lines changed: 66 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import asyncio
34
from collections import deque
45
from datetime import datetime, timedelta, timezone
56
from logging import getLogger
@@ -84,6 +85,9 @@ def __init__(
8485
self._assumed_handled_count = 0
8586
"""The number of requests we assume have been handled (tracked manually for this instance)."""
8687

88+
self._fetch_lock = asyncio.Lock()
89+
"""Fetch lock to minimize race conditions when communicating with API."""
90+
8791
@override
8892
async def get_metadata(self) -> RequestQueueMetadata:
8993
total_count = self._initial_total_count + self._assumed_total_count
@@ -268,7 +272,6 @@ async def add_batch_of_requests(
268272
self._cache_request(
269273
unique_key_to_request_id(request.unique_key),
270274
processed_request,
271-
forefront=False,
272275
)
273276
new_requests.append(request)
274277

@@ -334,15 +337,17 @@ async def fetch_next_request(self) -> Request | None:
334337
Returns:
335338
The request or `None` if there are no more pending requests.
336339
"""
337-
# Ensure the queue head has requests if available
338-
await self._ensure_head_is_non_empty()
340+
# Ensure the queue head has requests if available. Fetch the head under the lock to prevent race conditions.
341+
async with self._fetch_lock:
342+
await self._ensure_head_is_non_empty()
339343

340-
# If queue head is empty after ensuring, there are no requests
341-
if not self._queue_head:
342-
return None
344+
# If queue head is empty after ensuring, there are no requests
345+
if not self._queue_head:
346+
return None
347+
348+
# Get the next request ID from the queue head
349+
next_request_id = self._queue_head.popleft()
343350

344-
# Get the next request ID from the queue head
345-
next_request_id = self._queue_head.popleft()
346351
request = await self._get_or_hydrate_request(next_request_id)
347352

348353
# Handle potential inconsistency where request might not be in the main table yet
@@ -388,6 +393,8 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest |
388393
if request.handled_at is None:
389394
request.handled_at = datetime.now(tz=timezone.utc)
390395

396+
if cached_request := self._requests_cache[request.id]:
397+
cached_request.was_already_handled = request.was_already_handled
391398
try:
392399
# Update the request in the API
393400
processed_request = await self._update_request(request)
@@ -402,7 +409,6 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest |
402409
self._cache_request(
403410
cache_key,
404411
processed_request,
405-
forefront=False,
406412
hydrated_request=request,
407413
)
408414
except Exception as exc:
@@ -434,40 +440,41 @@ async def reclaim_request(
434440
if request.was_already_handled:
435441
request.handled_at = None
436442

437-
try:
438-
# Update the request in the API.
439-
processed_request = await self._update_request(request, forefront=forefront)
440-
processed_request.unique_key = request.unique_key
443+
# Reclaim with lock to prevent race conditions that could lead to double processing of the same request.
444+
async with self._fetch_lock:
445+
try:
446+
# Update the request in the API.
447+
processed_request = await self._update_request(request, forefront=forefront)
448+
processed_request.unique_key = request.unique_key
441449

442-
# If the request was previously handled, decrement our handled count since
443-
# we're putting it back for processing.
444-
if request.was_already_handled and not processed_request.was_already_handled:
445-
self._assumed_handled_count -= 1
450+
# If the request was previously handled, decrement our handled count since
451+
# we're putting it back for processing.
452+
if request.was_already_handled and not processed_request.was_already_handled:
453+
self._assumed_handled_count -= 1
446454

447-
# Update the cache
448-
cache_key = unique_key_to_request_id(request.unique_key)
449-
self._cache_request(
450-
cache_key,
451-
processed_request,
452-
forefront=forefront,
453-
hydrated_request=request,
454-
)
455+
# Update the cache
456+
cache_key = unique_key_to_request_id(request.unique_key)
457+
self._cache_request(
458+
cache_key,
459+
processed_request,
460+
hydrated_request=request,
461+
)
455462

456-
# If we're adding to the forefront, we need to check for forefront requests
457-
# in the next list_head call
458-
if forefront:
459-
self._should_check_for_forefront_requests = True
463+
# If we're adding to the forefront, we need to check for forefront requests
464+
# in the next list_head call
465+
if forefront:
466+
self._should_check_for_forefront_requests = True
460467

461-
# Try to release the lock on the request
462-
try:
463-
await self._delete_request_lock(request.id, forefront=forefront)
464-
except Exception as err:
465-
logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err)
466-
except Exception as exc:
467-
logger.debug(f'Error reclaiming request {request.id}: {exc!s}')
468-
return None
469-
else:
470-
return processed_request
468+
# Try to release the lock on the request
469+
try:
470+
await self._delete_request_lock(request.id, forefront=forefront)
471+
except Exception as err:
472+
logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err)
473+
except Exception as exc:
474+
logger.debug(f'Error reclaiming request {request.id}: {exc!s}')
475+
return None
476+
else:
477+
return processed_request
471478

472479
@override
473480
async def is_empty(self) -> bool:
@@ -476,9 +483,11 @@ async def is_empty(self) -> bool:
476483
Returns:
477484
True if the queue is empty, False otherwise.
478485
"""
479-
head = await self._list_head(limit=1, lock_time=None)
480-
481-
return len(head.items) == 0 and not self._queue_has_locked_requests
486+
# Check _list_head and self._queue_has_locked_requests with lock to make sure they are consistent.
487+
# Without the lock, `is_empty` can falsely report True because of a low-probability race condition.
488+
async with self._fetch_lock:
489+
head = await self._list_head(limit=1, lock_time=None)
490+
return len(head.items) == 0 and not self._queue_has_locked_requests
482491

483492
async def _ensure_head_is_non_empty(self) -> None:
484493
"""Ensure that the queue head has requests if they are available in the queue."""
@@ -507,9 +516,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
507516
# Try to prolong the lock if it's expired
508517
try:
509518
lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds())
510-
response = await self._prolong_request_lock(
511-
request_id, forefront=cached_entry.forefront, lock_secs=lock_secs
512-
)
519+
response = await self._prolong_request_lock(request_id, lock_secs=lock_secs)
513520
cached_entry.lock_expires_at = response.lock_expires_at
514521
except Exception:
515522
# If prolonging the lock fails, we lost the request
@@ -522,7 +529,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
522529
try:
523530
# Try to acquire or prolong the lock
524531
lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds())
525-
await self._prolong_request_lock(request_id, forefront=False, lock_secs=lock_secs)
532+
await self._prolong_request_lock(request_id, lock_secs=lock_secs)
526533

527534
# Fetch the request data
528535
request = await self.get_request(request_id)
@@ -542,7 +549,6 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
542549
was_already_present=True,
543550
was_already_handled=request.handled_at is not None,
544551
),
545-
forefront=False,
546552
hydrated_request=request,
547553
)
548554
except Exception as exc:
@@ -594,7 +600,6 @@ async def _list_head(
594600
# Return from cache if available and we're not checking for new forefront requests
595601
if self._queue_head and not self._should_check_for_forefront_requests:
596602
logger.debug(f'Using cached queue head with {len(self._queue_head)} requests')
597-
598603
# Create a list of requests from the cached queue head
599604
items = []
600605
for request_id in list(self._queue_head)[:limit]:
@@ -612,6 +617,11 @@ async def _list_head(
612617
queue_has_locked_requests=self._queue_has_locked_requests,
613618
lock_time=lock_time,
614619
)
620+
leftover_buffer = list[str]()
621+
if self._should_check_for_forefront_requests:
622+
leftover_buffer = list(self._queue_head)
623+
self._queue_head.clear()
624+
self._should_check_for_forefront_requests = False
615625

616626
# Otherwise fetch from API
617627
lock_time = lock_time or self._DEFAULT_LOCK_TIME
@@ -625,15 +635,6 @@ async def _list_head(
625635
# Update the queue head cache
626636
self._queue_has_locked_requests = response.get('queueHasLockedRequests', False)
627637

628-
# Clear current queue head if we're checking for forefront requests
629-
if self._should_check_for_forefront_requests:
630-
self._queue_head.clear()
631-
self._should_check_for_forefront_requests = False
632-
633-
# Process and cache the requests
634-
head_id_buffer = list[str]()
635-
forefront_head_id_buffer = list[str]()
636-
637638
for request_data in response.get('items', []):
638639
request = Request.model_validate(request_data)
639640

@@ -648,59 +649,44 @@ async def _list_head(
648649
)
649650
continue
650651

651-
# Check if this request was already cached and if it was added to forefront
652-
cache_key = unique_key_to_request_id(request.unique_key)
653-
cached_request = self._requests_cache.get(cache_key)
654-
forefront = cached_request.forefront if cached_request else False
655-
656-
# Add to appropriate buffer based on forefront flag
657-
if forefront:
658-
forefront_head_id_buffer.insert(0, request.id)
659-
else:
660-
head_id_buffer.append(request.id)
661-
662652
# Cache the request
663653
self._cache_request(
664-
cache_key,
654+
unique_key_to_request_id(request.unique_key),
665655
ProcessedRequest(
666656
id=request.id,
667657
unique_key=request.unique_key,
668658
was_already_present=True,
669659
was_already_handled=False,
670660
),
671-
forefront=forefront,
672661
hydrated_request=request,
673662
)
663+
self._queue_head.append(request.id)
674664

675-
# Update the queue head deque
676-
for request_id in head_id_buffer:
677-
self._queue_head.append(request_id)
678-
679-
for request_id in forefront_head_id_buffer:
680-
self._queue_head.appendleft(request_id)
681-
665+
for leftover_request_id in leftover_buffer:
666+
# After adding new requests to the forefront, any existing leftover locked request is kept at the end.
667+
self._queue_head.append(leftover_request_id)
682668
return RequestQueueHead.model_validate(response)
683669

684670
async def _prolong_request_lock(
685671
self,
686672
request_id: str,
687673
*,
688-
forefront: bool = False,
689674
lock_secs: int,
690675
) -> ProlongRequestLockResponse:
691676
"""Prolong the lock on a specific request in the queue.
692677
693678
Args:
694679
request_id: The identifier of the request whose lock is to be prolonged.
695-
forefront: Whether to put the request in the beginning or the end of the queue after lock expires.
696680
lock_secs: The additional amount of time, in seconds, that the request will remain locked.
697681
698682
Returns:
699683
A response containing the time at which the lock will expire.
700684
"""
701685
response = await self._api_client.prolong_request_lock(
702686
request_id=request_id,
703-
forefront=forefront,
687+
# All requests reaching this code were the tip of the queue at the moment when they were fetched,
688+
# so if their lock expires, they should be put back to the forefront as their handling is long overdue.
689+
forefront=True,
704690
lock_secs=lock_secs,
705691
)
706692

@@ -747,7 +733,6 @@ def _cache_request(
747733
cache_key: str,
748734
processed_request: ProcessedRequest,
749735
*,
750-
forefront: bool,
751736
hydrated_request: Request | None = None,
752737
) -> None:
753738
"""Cache a request for future use.
@@ -763,5 +748,4 @@ def _cache_request(
763748
was_already_handled=processed_request.was_already_handled,
764749
hydrated=hydrated_request,
765750
lock_expires_at=None,
766-
forefront=forefront,
767751
)

0 commit comments

Comments
 (0)