Skip to content

Commit 94e0f39

Browse files
committed
Add purge_if_needed method and improve some typing based on Pylance
1 parent 60e0968 commit 94e0f39

17 files changed

+107
-65
lines changed

src/crawlee/_utils/docs.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from __future__ import annotations
22

3-
from typing import Callable, Literal
3+
from typing import Any, Callable, Literal, TypeVar
44

55
GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Event payloads', 'Errors', 'Functions']
66

7+
T = TypeVar('T', bound=Callable[..., Any])
78

8-
def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001
9+
10+
def docs_group(group_name: GroupName) -> Callable[[T], T]: # noqa: ARG001
911
"""Mark a symbol for rendering and grouping in documentation.
1012
1113
This decorator is used solely for documentation purposes and does not modify the behavior
@@ -18,7 +20,7 @@ def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001
1820
The original callable without modification.
1921
"""
2022

21-
def wrapper(func: Callable) -> Callable:
23+
def wrapper(func: T) -> T:
2224
return func
2325

2426
return wrapper

src/crawlee/_utils/file.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _sync_write() -> str:
128128

129129

130130
async def export_json_to_stream(
131-
iterator: AsyncIterator[dict],
131+
iterator: AsyncIterator[dict[str, Any]],
132132
dst: TextIO,
133133
**kwargs: Unpack[ExportDataJsonKwargs],
134134
) -> None:
@@ -137,7 +137,7 @@ async def export_json_to_stream(
137137

138138

139139
async def export_csv_to_stream(
140-
iterator: AsyncIterator[dict],
140+
iterator: AsyncIterator[dict[str, Any]],
141141
dst: TextIO,
142142
**kwargs: Unpack[ExportDataCsvKwargs],
143143
) -> None:

src/crawlee/request_loaders/_request_list.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,13 @@ def __init__(
5454
def name(self) -> str | None:
5555
return self._name
5656

57-
@override
5857
@property
58+
@override
5959
async def handled_count(self) -> int:
6060
return self._handled_count
6161

62-
@override
6362
@property
63+
@override
6464
async def total_count(self) -> int:
6565
return self._assumed_total_count
6666

src/crawlee/request_loaders/_request_manager_tandem.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ def __init__(self, request_loader: RequestLoader, request_manager: RequestManage
3232
self._read_only_loader = request_loader
3333
self._read_write_manager = request_manager
3434

35-
@override
3635
@property
36+
@override
3737
async def handled_count(self) -> int:
3838
return await self._read_write_manager.handled_count
3939

40-
@override
4140
@property
41+
@override
4242
async def total_count(self) -> int:
4343
return (await self._read_only_loader.total_count) + (await self._read_write_manager.total_count)
4444

src/crawlee/storage_clients/_base/_dataset_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ async def iterate_items(
112112
unwind: str | None = None,
113113
skip_empty: bool = False,
114114
skip_hidden: bool = False,
115-
) -> AsyncIterator[dict]:
115+
) -> AsyncIterator[dict[str, Any]]:
116116
"""Iterate over the dataset items with filtering options.
117117
118118
The backend method for the `Dataset.iterate_items` call.

src/crawlee/storage_clients/_base/_storage_client.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from abc import ABC, abstractmethod
44
from typing import TYPE_CHECKING
55

6+
from crawlee._utils.docs import docs_group
7+
68
if TYPE_CHECKING:
79
from crawlee.configuration import Configuration
810

@@ -11,8 +13,20 @@
1113
from ._request_queue_client import RequestQueueClient
1214

1315

16+
@docs_group('Abstract classes')
1417
class StorageClient(ABC):
15-
"""Base class for storage clients."""
18+
"""Base class for storage clients.
19+
20+
The `StorageClient` serves as an abstract base class that defines the interface for accessing Crawlee's
21+
storage types: datasets, key-value stores, and request queues. It provides methods to open clients for
22+
each of these storage types and handles common functionality.
23+
24+
Storage client implementations can be provided for various backends (file system, memory, databases,
25+
various cloud providers, etc.) to support different use cases from development to production environments.
26+
27+
Each storage client implementation is responsible for ensuring proper initialization, data persistence
28+
(where applicable), and consistent access patterns across all storage types it supports.
29+
"""
1630

1731
@abstractmethod
1832
async def open_dataset_client(
@@ -47,3 +61,21 @@ async def open_request_queue_client(
4761
def get_rate_limit_errors(self) -> dict[int, int]:
4862
"""Return statistics about rate limit errors encountered by the HTTP client in storage client."""
4963
return {}
64+
65+
async def _purge_if_needed(
66+
self,
67+
client: DatasetClient | KeyValueStoreClient | RequestQueueClient,
68+
configuration: Configuration,
69+
) -> None:
70+
"""Purge the client if needed.
71+
72+
The purge is only performed if the configuration indicates that it should be done and the client
73+
is not a named storage. Named storages are considered global and will typically outlive the run,
74+
so they are not purged.
75+
76+
Args:
77+
client: The storage client to potentially purge.
78+
configuration: Configuration that determines whether purging should occur.
79+
"""
80+
if configuration.purge_on_start and client.metadata.name is None:
81+
await client.purge()

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from datetime import datetime, timezone
77
from logging import getLogger
88
from pathlib import Path
9-
from typing import TYPE_CHECKING
9+
from typing import TYPE_CHECKING, Any
1010

1111
from pydantic import ValidationError
1212
from typing_extensions import override
@@ -19,7 +19,6 @@
1919

2020
if TYPE_CHECKING:
2121
from collections.abc import AsyncIterator
22-
from typing import Any
2322

2423
from crawlee.configuration import Configuration
2524

@@ -83,8 +82,8 @@ def __init__(
8382
self._lock = asyncio.Lock()
8483
"""A lock to ensure that only one operation is performed at a time."""
8584

86-
@override
8785
@property
86+
@override
8887
def metadata(self) -> DatasetMetadata:
8988
return self._metadata
9089

@@ -258,7 +257,7 @@ async def get_data(
258257
view: str | None = None,
259258
) -> DatasetItemsListPage:
260259
# Check for unsupported arguments and log a warning if found.
261-
unsupported_args = {
260+
unsupported_args: dict[str, Any] = {
262261
'clean': clean,
263262
'fields': fields,
264263
'omit': omit,
@@ -307,7 +306,7 @@ async def get_data(
307306
selected_files = selected_files[:limit]
308307

309308
# Read and parse each data file.
310-
items = []
309+
items = list[dict[str, Any]]()
311310
for file_path in selected_files:
312311
try:
313312
file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
@@ -353,9 +352,9 @@ async def iterate_items(
353352
unwind: str | None = None,
354353
skip_empty: bool = False,
355354
skip_hidden: bool = False,
356-
) -> AsyncIterator[dict]:
355+
) -> AsyncIterator[dict[str, Any]]:
357356
# Check for unsupported arguments and log a warning if found.
358-
unsupported_args = {
357+
unsupported_args: dict[str, Any] = {
359358
'clean': clean,
360359
'fields': fields,
361360
'omit': omit,

src/crawlee/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ def __init__(
7979
self._lock = asyncio.Lock()
8080
"""A lock to ensure that only one operation is performed at a time."""
8181

82-
@override
8382
@property
83+
@override
8484
def metadata(self) -> KeyValueStoreMetadata:
8585
return self._metadata
8686

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@ def __init__(
9999
self._sequence_counter = 0
100100
"""A counter to track the order of requests added to the queue."""
101101

102-
@override
103102
@property
103+
@override
104104
def metadata(self) -> RequestQueueMetadata:
105105
return self._metadata
106106

src/crawlee/storage_clients/_file_system/_storage_client.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from typing_extensions import override
44

5+
from crawlee._utils.docs import docs_group
56
from crawlee.configuration import Configuration
67
from crawlee.storage_clients._base import StorageClient
78

@@ -10,8 +11,20 @@
1011
from ._request_queue_client import FileSystemRequestQueueClient
1112

1213

14+
@docs_group('Classes')
1315
class FileSystemStorageClient(StorageClient):
14-
"""File system storage client."""
16+
"""File system implementation of the storage client.
17+
18+
This storage client provides access to datasets, key-value stores, and request queues that persist data
19+
to the local file system. Each storage type is implemented with its own specific file system client
20+
that stores data in a structured directory hierarchy.
21+
22+
Data is stored in JSON format in predictable file paths, making it easy to inspect and manipulate
23+
the stored data outside of the Crawlee application if needed.
24+
25+
All data persists between program runs but is limited to access from the local machine
26+
where the files are stored.
27+
"""
1528

1629
@override
1730
async def open_dataset_client(
@@ -23,10 +36,7 @@ async def open_dataset_client(
2336
) -> FileSystemDatasetClient:
2437
configuration = configuration or Configuration.get_global_configuration()
2538
client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration)
26-
27-
if configuration.purge_on_start and client.metadata.name is None:
28-
await client.purge()
29-
39+
await self._purge_if_needed(client, configuration)
3040
return client
3141

3242
@override
@@ -39,10 +49,7 @@ async def open_key_value_store_client(
3949
) -> FileSystemKeyValueStoreClient:
4050
configuration = configuration or Configuration.get_global_configuration()
4151
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
42-
43-
if configuration.purge_on_start and client.metadata.name is None:
44-
await client.purge()
45-
52+
await self._purge_if_needed(client, configuration)
4653
return client
4754

4855
@override
@@ -55,8 +62,5 @@ async def open_request_queue_client(
5562
) -> FileSystemRequestQueueClient:
5663
configuration = configuration or Configuration.get_global_configuration()
5764
client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration)
58-
59-
if configuration.purge_on_start and client.metadata.name is None:
60-
await client.purge()
61-
65+
await self._purge_if_needed(client, configuration)
6266
return client

0 commit comments

Comments
 (0)