Skip to content

Commit 6ee60a0

Browse files
authored
fix: Save RequestQueueState for FileSystemRequestQueueClient in default KVS (#1411)
### Description

- Save `RequestQueueState` for `FileSystemRequestQueueClient` in the default KVS.

### Issues

- Closes: #1410
1 parent 8357813 commit 6ee60a0

File tree

3 files changed

+10
-4
lines changed

3 files changed

+10
-4
lines changed

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,8 @@ def __init__(
115115

116116
self._state = RecoverableState[RequestQueueState](
117117
default_state=RequestQueueState(),
118-
persist_state_key='request_queue_state',
118+
persist_state_key=f'__RQ_STATE_{self._metadata.id}',
119119
persistence_enabled=True,
120-
persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
121120
logger=logger,
122121
)
123122
"""Recoverable state to maintain request ordering, in-progress status, and handled status."""

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -565,11 +565,15 @@ async def request_handler(context: HttpCrawlingContext) -> None:
565565
kvs = await crawler.get_key_value_store()
566566
kvs_content = {}
567567
async for key_info in kvs.iterate_keys():
568+
# Skip any non-error snapshot keys, e.g. __RQ_STATE_.
569+
if 'ERROR_SNAPSHOT' not in key_info.key:
570+
continue
568571
kvs_content[key_info.key] = await kvs.get_value(key_info.key)
569572

570573
# One error, retried three times.
574+
content_key = next(iter(kvs_content))
571575
assert crawler.statistics.error_tracker.total == 4
572576
assert crawler.statistics.error_tracker.unique_error_count == 1
573577
assert len(kvs_content) == 1
574-
assert key_info.key.endswith('.html')
575-
assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf8')
578+
assert content_key.endswith('.html')
579+
assert kvs_content[content_key] == HELLO_WORLD.decode('utf8')

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
631631
kvs_content = {}
632632

633633
async for key_info in kvs.iterate_keys():
634+
# Skip any non-error snapshot keys, e.g. __RQ_STATE_.
635+
if 'ERROR_SNAPSHOT' not in key_info.key:
636+
continue
634637
kvs_content[key_info.key] = await kvs.get_value(key_info.key)
635638

636639
assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS)

0 commit comments

Comments
 (0)