3
3
import asyncio
4
4
import logging
5
5
import math
6
- from dataclasses import dataclass
7
- from datetime import timedelta
6
+ from collections .abc import Iterable
8
7
from queue import Queue
9
- from time import sleep
10
- from typing import TYPE_CHECKING , Any , TypedDict
8
+ from typing import Any , TypedDict
11
9
12
10
from apify_shared .utils import filter_out_none_values_recursively , ignore_docs , parse_date_fields
13
11
from more_itertools import constrained_batches
16
14
from apify_client ._utils import catch_not_found_or_throw , pluck_data
17
15
from apify_client .clients .base import ResourceClient , ResourceClientAsync
18
16
19
- if TYPE_CHECKING :
20
- from collections .abc import Iterable
21
-
22
17
logger = logging .getLogger (__name__ )
23
18
24
19
_RQ_MAX_REQUESTS_PER_BATCH = 25
@@ -41,17 +36,9 @@ class BatchAddRequestsResult(TypedDict):
41
36
unprocessedRequests : list [dict ]
42
37
43
38
44
- @dataclass
45
- class AddRequestsBatch :
46
- """Batch of requests to add to the request queue.
47
-
48
- Args:
49
- requests: List of requests to be added to the request queue.
50
- num_of_retries: Number of times this batch has been retried.
51
- """
52
-
53
- requests : Iterable [dict ]
54
- num_of_retries : int = 0
39
+ def _get_unprocessed_request_from_request (request : dict [str , str ]) -> dict [str , str ]:
40
+ relevant_keys = {'url' , 'uniqueKey' , 'method' }
41
+ return {key : value for key , value in request .items () if key in relevant_keys }
55
42
56
43
57
44
class RequestQueueClient (ResourceClient ):
@@ -297,8 +284,6 @@ def batch_add_requests(
297
284
* ,
298
285
forefront : bool = False ,
299
286
max_parallel : int = 1 ,
300
- max_unprocessed_requests_retries : int = 3 ,
301
- min_delay_between_unprocessed_requests_retries : timedelta = timedelta (milliseconds = 500 ),
302
287
) -> BatchAddRequestsResult :
303
288
"""Add requests to the request queue in batches.
304
289
@@ -312,9 +297,6 @@ def batch_add_requests(
312
297
max_parallel: Specifies the maximum number of parallel tasks for API calls. This is only applicable
313
298
to the async client. For the sync client, this value must be set to 1, as parallel execution
314
299
is not supported.
315
- max_unprocessed_requests_retries: Number of retry attempts for unprocessed requests.
316
- min_delay_between_unprocessed_requests_retries: Minimum delay between retry attempts for unprocessed
317
- requests.
318
300
319
301
Returns:
320
302
Result containing lists of processed and unprocessed requests.
@@ -335,42 +317,43 @@ def batch_add_requests(
335
317
)
336
318
337
319
# Put the batches into the queue for processing.
338
- queue = Queue [AddRequestsBatch ]()
320
+ queue = Queue [Iterable [ dict ] ]()
339
321
340
- for b in batches :
341
- queue .put (AddRequestsBatch ( b ) )
322
+ for batch in batches :
323
+ queue .put (batch )
342
324
343
325
processed_requests = list [dict ]()
344
- unprocessed_requests = list [ dict ]()
326
+ unprocessed_requests = dict [ str , dict ]()
345
327
346
328
# Process all batches in the queue sequentially.
347
329
while not queue .empty ():
348
- batch = queue .get ()
330
+ request_batch = queue .get ()
331
+ # All requests are considered unprocessed unless explicitly mentioned in `processedRequests` response.
332
+ for request in request_batch :
333
+ unprocessed_requests [request ['uniqueKey' ]] = _get_unprocessed_request_from_request (request )
349
334
350
- # Send the batch to the API.
351
- response = self .http_client .call (
352
- url = self ._url ('requests/batch' ),
353
- method = 'POST' ,
354
- params = request_params ,
355
- json = list (batch .requests ),
356
- timeout_secs = _MEDIUM_TIMEOUT ,
357
- )
358
-
359
- # Retry if the request failed and the retry limit has not been reached.
360
- if not response .is_success and batch .num_of_retries < max_unprocessed_requests_retries :
361
- batch .num_of_retries += 1
362
- sleep (min_delay_between_unprocessed_requests_retries .total_seconds ())
363
- queue .put (batch )
335
+ try :
336
+ # Send the batch to the API.
337
+ response = self .http_client .call (
338
+ url = self ._url ('requests/batch' ),
339
+ method = 'POST' ,
340
+ params = request_params ,
341
+ json = list (request_batch ),
342
+ timeout_secs = _MEDIUM_TIMEOUT ,
343
+ )
364
344
365
- # Otherwise, add the processed/unprocessed requests to their respective lists.
366
- else :
367
345
response_parsed = parse_date_fields (pluck_data (response .json ()))
368
346
processed_requests .extend (response_parsed .get ('processedRequests' , []))
369
- unprocessed_requests .extend (response_parsed .get ('unprocessedRequests' , []))
347
+
348
+ for processed_request in response_parsed .get ('processedRequests' , []):
349
+ unprocessed_requests .pop (processed_request ['uniqueKey' ], None )
350
+
351
+ except Exception as exc :
352
+ logger .warning (f'Error occurred while processing a batch of requests: { exc } ' )
370
353
371
354
return {
372
355
'processedRequests' : processed_requests ,
373
- 'unprocessedRequests' : unprocessed_requests ,
356
+ 'unprocessedRequests' : list ( unprocessed_requests . values ()) ,
374
357
}
375
358
376
359
def batch_delete_requests (self , requests : list [dict ]) -> dict :
@@ -661,24 +644,26 @@ async def delete_request_lock(
661
644
662
645
async def _batch_add_requests_worker (
663
646
self ,
664
- queue : asyncio .Queue [AddRequestsBatch ],
647
+ queue : asyncio .Queue [Iterable [ dict ] ],
665
648
request_params : dict ,
666
- max_unprocessed_requests_retries : int ,
667
- min_delay_between_unprocessed_requests_retries : timedelta ,
668
649
) -> BatchAddRequestsResult :
669
650
"""Worker function to process a batch of requests.
670
651
671
- This worker will process batches from the queue, retrying requests that fail until the retry limit is reached .
652
+ This worker will process batches from the queue.
672
653
673
654
Return result containing lists of processed and unprocessed requests by the worker.
674
655
"""
675
656
processed_requests = list [dict ]()
676
- unprocessed_requests = list [ dict ]()
657
+ unprocessed_requests = dict [ str , dict ]()
677
658
678
659
while True :
679
660
# Get the next batch from the queue.
680
661
try :
681
- batch = await queue .get ()
662
+ request_batch = await queue .get ()
663
+ # All requests are considered unprocessed unless explicitly mentioned in `processedRequests` response.
664
+ for request in request_batch :
665
+ unprocessed_requests [request ['uniqueKey' ]] = _get_unprocessed_request_from_request (request )
666
+
682
667
except asyncio .CancelledError :
683
668
break
684
669
@@ -688,22 +673,15 @@ async def _batch_add_requests_worker(
688
673
url = self ._url ('requests/batch' ),
689
674
method = 'POST' ,
690
675
params = request_params ,
691
- json = list (batch . requests ),
676
+ json = list (request_batch ),
692
677
timeout_secs = _MEDIUM_TIMEOUT ,
693
678
)
694
679
695
680
response_parsed = parse_date_fields (pluck_data (response .json ()))
681
+ processed_requests .extend (response_parsed .get ('processedRequests' , []))
696
682
697
- # Retry if the request failed and the retry limit has not been reached.
698
- if not response .is_success and batch .num_of_retries < max_unprocessed_requests_retries :
699
- batch .num_of_retries += 1
700
- await asyncio .sleep (min_delay_between_unprocessed_requests_retries .total_seconds ())
701
- await queue .put (batch )
702
-
703
- # Otherwise, add the processed/unprocessed requests to their respective lists.
704
- else :
705
- processed_requests .extend (response_parsed .get ('processedRequests' , []))
706
- unprocessed_requests .extend (response_parsed .get ('unprocessedRequests' , []))
683
+ for processed_request in response_parsed .get ('processedRequests' , []):
684
+ unprocessed_requests .pop (processed_request ['uniqueKey' ], None )
707
685
708
686
except Exception as exc :
709
687
logger .warning (f'Error occurred while processing a batch of requests: { exc } ' )
@@ -714,7 +692,7 @@ async def _batch_add_requests_worker(
714
692
715
693
return {
716
694
'processedRequests' : processed_requests ,
717
- 'unprocessedRequests' : unprocessed_requests ,
695
+ 'unprocessedRequests' : list ( unprocessed_requests . values ()) ,
718
696
}
719
697
720
698
async def batch_add_requests (
@@ -723,8 +701,6 @@ async def batch_add_requests(
723
701
* ,
724
702
forefront : bool = False ,
725
703
max_parallel : int = 5 ,
726
- max_unprocessed_requests_retries : int = 3 ,
727
- min_delay_between_unprocessed_requests_retries : timedelta = timedelta (milliseconds = 500 ),
728
704
) -> BatchAddRequestsResult :
729
705
"""Add requests to the request queue in batches.
730
706
@@ -738,15 +714,12 @@ async def batch_add_requests(
738
714
max_parallel: Specifies the maximum number of parallel tasks for API calls. This is only applicable
739
715
to the async client. For the sync client, this value must be set to 1, as parallel execution
740
716
is not supported.
741
- max_unprocessed_requests_retries: Number of retry attempts for unprocessed requests.
742
- min_delay_between_unprocessed_requests_retries: Minimum delay between retry attempts for unprocessed
743
- requests.
744
717
745
718
Returns:
746
719
Result containing lists of processed and unprocessed requests.
747
720
"""
748
721
tasks = set [asyncio .Task ]()
749
- queue : asyncio .Queue [AddRequestsBatch ] = asyncio .Queue ()
722
+ queue : asyncio .Queue [Iterable [ dict ] ] = asyncio .Queue ()
750
723
request_params = self ._params (clientKey = self .client_key , forefront = forefront )
751
724
752
725
# Compute the payload size limit to ensure it doesn't exceed the maximum allowed size.
@@ -760,15 +733,13 @@ async def batch_add_requests(
760
733
)
761
734
762
735
for batch in batches :
763
- await queue .put (AddRequestsBatch ( batch ) )
736
+ await queue .put (batch )
764
737
765
738
# Start a required number of worker tasks to process the batches.
766
739
for i in range (max_parallel ):
767
740
coro = self ._batch_add_requests_worker (
768
741
queue ,
769
742
request_params ,
770
- max_unprocessed_requests_retries ,
771
- min_delay_between_unprocessed_requests_retries ,
772
743
)
773
744
task = asyncio .create_task (coro , name = f'batch_add_requests_worker_{ i } ' )
774
745
tasks .add (task )
0 commit comments