Skip to content

Commit 86380e4

Browse files
committed
Memory storage clients and their tests
1 parent da22ef5 commit 86380e4

File tree

15 files changed

+1035
-260
lines changed

15 files changed

+1035
-260
lines changed

src/crawlee/_utils/file.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,31 @@ async def json_dumps(obj: Any) -> str:
103103
return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str)
104104

105105

106+
def infer_mime_type(value: Any) -> str:
    """Infer the MIME content type from the value.

    The decision is based purely on the Python type of `value`; the content itself is never inspected.

    Args:
        value: The value to infer the content type from.

    Returns:
        The inferred MIME content type:
            - `bytes` / `bytearray` -> `application/octet-stream`
            - `dict` / `list` -> `application/json; charset=utf-8`
            - `str`, `int`, `float`, `bool` -> `text/plain; charset=utf-8`
            - anything else (including `None`) -> `application/octet-stream`
    """
    # If the value is bytes (or bytearray), return binary content type.
    if isinstance(value, (bytes, bytearray)):
        return 'application/octet-stream'

    # If the value is a dict or list, assume JSON.
    if isinstance(value, (dict, list)):
        return 'application/json; charset=utf-8'

    # If the value is a string, number or boolean, assume plain text. Scalars have a natural textual
    # representation, so labelling them as opaque binary would be misleading. Note that `bool` is
    # a subclass of `int`; it is listed explicitly only for readability.
    if isinstance(value, (str, int, float, bool)):
        return 'text/plain; charset=utf-8'

    # Default fallback for types without an obvious textual or JSON representation.
    return 'application/octet-stream'
129+
130+
106131
async def export_json_to_stream(
107132
iterator: AsyncIterator[dict],
108133
dst: TextIO,

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -714,9 +714,9 @@ async def export_data(
714714
dst = path.open('w', newline='')
715715

716716
if path.suffix == '.csv':
717-
await export_csv_to_stream(dataset.iterate(), dst)
717+
await export_csv_to_stream(dataset.iterate_items(), dst)
718718
elif path.suffix == '.json':
719-
await export_json_to_stream(dataset.iterate(), dst)
719+
await export_json_to_stream(dataset.iterate_items(), dst)
720720
else:
721721
raise ValueError(f'Unsupported file extension: {path.suffix}')
722722

src/crawlee/storage_clients/_base/_dataset_client.py

Lines changed: 31 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -14,81 +14,74 @@
1414
from crawlee.storage_clients.models import DatasetItemsListPage
1515

1616

17-
# Properties:
18-
# - id
19-
# - name
20-
# - created_at
21-
# - accessed_at
22-
# - modified_at
23-
# - item_count
24-
25-
# Methods:
26-
# - open
27-
# - drop
28-
# - push_data
29-
# - get_data
30-
# - iterate
31-
32-
3317
@docs_group('Abstract classes')
3418
class DatasetClient(ABC):
35-
"""An abstract class for dataset resource clients.
19+
"""An abstract class for dataset storage clients.
20+
21+
Dataset clients provide an interface for accessing and manipulating dataset storage. They handle
22+
operations like adding and getting dataset items across different storage backends.
3623
37-
These clients are specific to the type of resource they manage and operate under a designated storage
38-
client, like a memory storage client.
24+
Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
25+
`RequestQueue`), and can operate with various storage systems including memory, file system,
26+
databases, and cloud storage solutions.
27+
28+
This abstract class defines the interface that all specific dataset clients must implement.
3929
"""
4030

4131
@property
4232
@abstractmethod
4333
def id(self) -> str:
44-
"""The ID of the dataset."""
34+
"""The ID of the dataset, a unique identifier, typically a UUID or similar value."""
4535

4636
@property
4737
@abstractmethod
4838
def name(self) -> str | None:
49-
"""The name of the dataset."""
39+
"""The optional human-readable name of the dataset."""
5040

5141
@property
5242
@abstractmethod
5343
def created_at(self) -> datetime:
54-
"""The time at which the dataset was created."""
44+
"""Timestamp when the dataset was first created, remains unchanged."""
5545

5646
@property
5747
@abstractmethod
5848
def accessed_at(self) -> datetime:
59-
"""The time at which the dataset was last accessed."""
49+
"""Timestamp of last access to the dataset, updated on read or write operations."""
6050

6151
@property
6252
@abstractmethod
6353
def modified_at(self) -> datetime:
64-
"""The time at which the dataset was last modified."""
54+
"""Timestamp of last modification of the dataset, updated when new data are added."""
6555

6656
@property
6757
@abstractmethod
6858
def item_count(self) -> int:
69-
"""The number of items in the dataset."""
59+
"""Total count of data items stored in the dataset."""
7060

7161
@classmethod
7262
@abstractmethod
7363
async def open(
7464
cls,
7565
*,
76-
id: str | None,
77-
name: str | None,
78-
storage_dir: Path,
66+
id: str | None = None,
67+
name: str | None = None,
68+
storage_dir: Path | None = None,
7969
) -> DatasetClient:
8070
"""Open existing or create a new dataset client.
8171
82-
If a dataset with the given name already exists, the appropriate dataset client is returned.
72+
If a dataset with the given name or ID already exists, the appropriate dataset client is returned.
8373
Otherwise, a new dataset is created and a client for it is returned.
8474
75+
The backend method for the `Dataset.open` call.
76+
8577
Args:
86-
id: The ID of the dataset.
87-
name: The name of the dataset.
88-
storage_dir: The path to the storage directory. If the client persists data, it should use this directory.
78+
id: The ID of the dataset. If not provided, an ID may be generated.
79+
name: The name of the dataset. If not provided, a default name may be used.
80+
storage_dir: The path to the storage directory. If the client persists data,
81+
it should use this directory. May be ignored by non-persistent implementations.
8982
9083
Returns:
91-
A dataset client.
84+
A dataset client instance.
9285
"""
9386

9487
@abstractmethod
@@ -99,7 +92,7 @@ async def drop(self) -> None:
9992
"""
10093

10194
@abstractmethod
102-
async def push_data(self, *, data: list[Any] | dict[str, Any]) -> None:
95+
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
10396
"""Push data to the dataset.
10497
10598
The backend method for the `Dataset.push_data` call.
@@ -121,13 +114,13 @@ async def get_data(
121114
flatten: list[str] | None = None,
122115
view: str | None = None,
123116
) -> DatasetItemsListPage:
124-
"""Get data from the dataset.
117+
"""Get data from the dataset with various filtering options.
125118
126119
The backend method for the `Dataset.get_data` call.
127120
"""
128121

129122
@abstractmethod
130-
async def iterate(
123+
async def iterate_items(
131124
self,
132125
*,
133126
offset: int = 0,
@@ -140,9 +133,9 @@ async def iterate(
140133
skip_empty: bool = False,
141134
skip_hidden: bool = False,
142135
) -> AsyncIterator[dict]:
143-
"""Iterate over the dataset.
136+
"""Iterate over the dataset items with filtering options.
144137
145-
The backend method for the `Dataset.iterate` call.
138+
The backend method for the `Dataset.iterate_items` call.
146139
"""
147140
# This syntax is to make mypy properly work with abstract AsyncIterator.
148141
# https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators

src/crawlee/storage_clients/_base/_key_value_store_client.py

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,82 +10,73 @@
1010
from datetime import datetime
1111
from pathlib import Path
1212

13-
from crawlee.storage_clients.models import (
14-
KeyValueStoreRecord,
15-
KeyValueStoreRecordMetadata,
16-
)
17-
18-
# Properties:
19-
# - id
20-
# - name
21-
# - created_at
22-
# - accessed_at
23-
# - modified_at
24-
25-
# Methods:
26-
# - open
27-
# - drop
28-
# - get_value
29-
# - set_value
30-
# - delete_value
31-
# - iterate_keys
32-
# - get_public_url
13+
from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
3314

3415

3516
@docs_group('Abstract classes')
3617
class KeyValueStoreClient(ABC):
37-
"""An abstract class for key-value store (KVS) resource clients.
18+
"""An abstract class for key-value store (KVS) storage clients.
3819
39-
These clients are specific to the type of resource they manage and operate under a designated storage
40-
client, like a memory storage client.
20+
Key-value store clients provide an interface for accessing and manipulating KVS storage. They handle
21+
operations like getting, setting, and deleting KVS values across different storage backends.
22+
23+
Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
24+
`RequestQueue`), and can operate with various storage systems including memory, file system,
25+
databases, and cloud storage solutions.
26+
27+
This abstract class defines the interface that all specific KVS clients must implement.
4128
"""
4229

4330
@property
4431
@abstractmethod
4532
def id(self) -> str:
46-
"""The ID of the key-value store."""
33+
"""The unique identifier of the key-value store (typically a UUID)."""
4734

4835
@property
4936
@abstractmethod
5037
def name(self) -> str | None:
51-
"""The name of the key-value store."""
38+
"""The optional human-readable name for the KVS."""
5239

5340
@property
5441
@abstractmethod
5542
def created_at(self) -> datetime:
56-
"""The time at which the key-value store was created."""
43+
"""Timestamp when the KVS was first created, remains unchanged."""
5744

5845
@property
5946
@abstractmethod
6047
def accessed_at(self) -> datetime:
61-
"""The time at which the key-value store was last accessed."""
48+
"""Timestamp of last access to the KVS, updated on read or write operations."""
6249

6350
@property
6451
@abstractmethod
6552
def modified_at(self) -> datetime:
66-
"""The time at which the key-value store was last modified."""
53+
"""Timestamp of last modification of the KVS, updated when new data are added, updated or deleted."""
6754

6855
@classmethod
6956
@abstractmethod
7057
async def open(
7158
cls,
7259
*,
73-
id: str | None,
74-
name: str | None,
75-
storage_dir: Path,
60+
id: str | None = None,
61+
name: str | None = None,
62+
storage_dir: Path | None = None,
7663
) -> KeyValueStoreClient:
7764
"""Open existing or create a new key-value store client.
7865
79-
If a key-value store with the given name already exists, the appropriate key-value store client is returned.
80-
Otherwise, a new key-value store is created and client for it is returned.
66+
If a key-value store with the given name or ID already exists, the appropriate
67+
key-value store client is returned. Otherwise, a new key-value store is created
68+
and a client for it is returned.
69+
70+
The backend method for the `KeyValueStore.open` call.
8171
8272
Args:
83-
id: The ID of the key-value store.
84-
name: The name of the key-value store.
85-
storage_dir: The path to the storage directory. If the client persists data, it should use this directory.
73+
id: The ID of the key-value store. If not provided, an ID may be generated.
74+
name: The name of the key-value store. If not provided, a default name may be used.
75+
storage_dir: The path to the storage directory. If the client persists data,
76+
it should use this directory. May be ignored by non-persistent implementations.
8677
8778
Returns:
88-
A key-value store client.
79+
A key-value store client instance.
8980
"""
9081

9182
@abstractmethod

0 commit comments

Comments
 (0)