Commit fecae5c

Update of dataset and its clients

1 parent 87772f5

30 files changed: +1052 -2540 lines

pyproject.toml (0 additions & 1 deletion)

@@ -142,7 +142,6 @@ ignore = [
     "PLR0911", # Too many return statements
     "PLR0913", # Too many arguments in function definition
     "PLR0915", # Too many statements
-    "PTH", # flake8-use-pathlib
     "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
     "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
     "S102", # Use of `exec` detected

src/crawlee/_service_locator.py (2 additions & 6 deletions)

@@ -77,13 +77,9 @@ def set_event_manager(self, event_manager: EventManager) -> None:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
-            from crawlee.storage_clients import MemoryStorageClient
+            from crawlee.storage_clients import file_system_storage_client

-            self._storage_client = (
-                MemoryStorageClient.from_config(config=self._configuration)
-                if self._configuration
-                else MemoryStorageClient.from_config()
-            )
+            self._storage_client = file_system_storage_client

         self._storage_client_was_retrieved = True
         return self._storage_client

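The change above replaces the lazily built MemoryStorageClient with the module-level file_system_storage_client as the default. A minimal usage sketch, assuming the service_locator singleton is importable from the crawlee package as in current releases:

    from crawlee import service_locator  # assumed public import path

    # With no explicit storage client configured, the locator now falls back to
    # the file-system-backed client instead of an in-memory one.
    storage_client = service_locator.get_storage_client()
    print(type(storage_client).__name__)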
src/crawlee/_types.py (0 additions & 4 deletions)

@@ -274,10 +274,6 @@ async def push_data(
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
-        from crawlee.storages._dataset import Dataset
-
-        await Dataset.check_and_serialize(data)
-
         self.push_data_calls.append(
             PushDataFunctionCall(
                 data=data,

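The removed call validated that the tracked data was JSON serializable. For context, a roughly equivalent standalone check (a hypothetical helper, not the crawlee implementation) could look like this:

    import json

    def ensure_json_serializable(data: object) -> None:
        # Raise early if the payload cannot be represented as JSON.
        try:
            json.dumps(data)
        except TypeError as exc:
            raise ValueError('Data must be JSON serializable.') from exc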
src/crawlee/fingerprint_suite/_browserforge_adapter.py (3 additions & 3 deletions)

@@ -1,10 +1,10 @@
 from __future__ import annotations

-import os.path
 from collections.abc import Iterable
 from copy import deepcopy
 from functools import reduce
 from operator import or_
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal

 from browserforge.bayesian_network import extract_json

@@ -253,9 +253,9 @@ def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str,

 def get_available_header_network() -> dict:
     """Get header network that contains possible header values."""
-    if os.path.isfile(DATA_DIR / 'header-network.zip'):
+    if Path(DATA_DIR / 'header-network.zip').is_file():
         return extract_json(DATA_DIR / 'header-network.zip')
-    if os.path.isfile(DATA_DIR / 'header-network-definition.zip'):
+    if Path(DATA_DIR / 'header-network-definition.zip').is_file():
         return extract_json(DATA_DIR / 'header-network-definition.zip')
     raise FileNotFoundError('Missing header-network file.')

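This is the pathlib migration that the pyproject.toml change enables by no longer ignoring the PTH (flake8-use-pathlib) rule. A minimal sketch of the same pattern, with DATA_DIR standing in for the package data directory:

    from pathlib import Path

    DATA_DIR = Path(__file__).parent / 'data'  # placeholder for the real data directory

    def has_header_network() -> bool:
        # Path.is_file() replaces os.path.isfile() on the same path object.
        return (DATA_DIR / 'header-network.zip').is_file()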
src/crawlee/storage_clients/__init__.py (7 additions & 2 deletions)

@@ -1,4 +1,9 @@
 from ._base import StorageClient
-from ._memory import MemoryStorageClient
+from ._file_system import file_system_storage_client
+from ._memory import memory_storage_client

-__all__ = ['MemoryStorageClient', 'StorageClient']
+__all__ = [
+    'StorageClient',
+    'file_system_storage_client',
+    'memory_storage_client'
+]

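Under the reworked exports, the package exposes pre-built client instances rather than the MemoryStorageClient class. A hypothetical import against the new names:

    # Assumes the new module-level instances introduced by this commit.
    from crawlee.storage_clients import StorageClient, file_system_storage_client, memory_storage_client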
src/crawlee/storage_clients/_base/__init__.py (0 additions & 5 deletions)

@@ -2,15 +2,10 @@
 from ._key_value_store_client import KeyValueStoreClient
 from ._request_queue_client import RequestQueueClient
 from ._storage_client import StorageClient
-from ._types import ResourceClient

 __all__ = [
     'DatasetClient',
-    'DatasetCollectionClient',
     'KeyValueStoreClient',
-    'KeyValueStoreCollectionClient',
     'RequestQueueClient',
-    'RequestQueueCollectionClient',
-    'ResourceClient',
     'StorageClient',
 ]

src/crawlee/storage_clients/_base/_dataset_client.py (66 additions & 148 deletions)

@@ -7,12 +7,11 @@

 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
-    from contextlib import AbstractAsyncContextManager
+    from datetime import datetime
+    from pathlib import Path
+    from typing import Any

-    from httpx import Response
-
-    from crawlee._types import JsonSerializable
-    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+    from crawlee.storage_clients.models import DatasetItemsListPage


 @docs_group('Abstract classes')
@@ -23,35 +22,78 @@ class DatasetClient(ABC):
     client, like a memory storage client.
     """

-    _LIST_ITEMS_LIMIT = 999_999_999_999
-    """This is what API returns in the x-apify-pagination-limit header when no limit query parameter is used."""
+    @property
+    @abstractmethod
+    def id(self) -> str:
+        """The ID of the dataset."""
+
+    @property
+    @abstractmethod
+    def name(self) -> str | None:
+        """The name of the dataset."""
+
+    @property
+    @abstractmethod
+    def created_at(self) -> datetime:
+        """The time at which the dataset was created."""
+
+    @property
+    @abstractmethod
+    def accessed_at(self) -> datetime:
+        """The time at which the dataset was last accessed."""

+    @property
     @abstractmethod
-    async def get(self) -> DatasetMetadata | None:
-        """Get metadata about the dataset being managed by this client.
+    def modified_at(self) -> datetime:
+        """The time at which the dataset was last modified."""
+
+    @property
+    @abstractmethod
+    def item_count(self) -> int:
+        """The number of items in the dataset."""
+
+    @classmethod
+    @abstractmethod
+    async def open(
+        cls,
+        id: str | None,
+        name: str | None,
+        storage_dir: Path,
+    ) -> DatasetClient:
+        """Open existing or create a new dataset client.
+
+        If a dataset with the given name already exists, the appropriate dataset client is returned.
+        Otherwise, a new dataset is created and client for it is returned.
+
+        Args:
+            id: The ID of the dataset.
+            name: The name of the dataset.
+            storage_dir: The path to the storage directory. If the client persists data, it should use this directory.

         Returns:
-            An object containing the dataset's details, or None if the dataset does not exist.
+            A dataset client.
         """

     @abstractmethod
-    async def delete(self) -> None:
-        """Permanently delete the dataset managed by this client."""
+    async def drop(self) -> None:
+        """Drop the whole dataset and remove all its items.
+
+        The backend method for the `Dataset.drop` call.
+        """

     @abstractmethod
-    async def push_items(self, items: JsonSerializable) -> None:
-        """Push items to the dataset.
+    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+        """Push data to the dataset.

-        Args:
-            items: The items which to push in the dataset. They must be JSON serializable.
+        The backend method for the `Dataset.push_data` call.
         """

     @abstractmethod
-    async def list_items(
+    async def get_data(
         self,
         *,
-        offset: int | None = 0,
-        limit: int | None = _LIST_ITEMS_LIMIT,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
         clean: bool = False,
         desc: bool = False,
         fields: list[str] | None = None,
@@ -62,31 +104,13 @@ async def list_items(
         flatten: list[str] | None = None,
         view: str | None = None,
     ) -> DatasetItemsListPage:
-        """Retrieve a paginated list of items from a dataset based on various filtering parameters.
-
-        This method provides the flexibility to filter, sort, and modify the appearance of dataset items
-        when listed. Each parameter modifies the result set according to its purpose. The method also
-        supports pagination through 'offset' and 'limit' parameters.
-
-        Args:
-            offset: The number of initial items to skip.
-            limit: The maximum number of items to return.
-            clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.
-            desc: If True, items are returned in descending order, i.e., newest first.
-            fields: Specifies a subset of fields to include in each item.
-            omit: Specifies a subset of fields to exclude from each item.
-            unwind: Specifies a field that should be unwound. If it's an array, each element becomes a separate record.
-            skip_empty: If True, omits items that are empty after other filters have been applied.
-            skip_hidden: If True, omits fields starting with the '#' character.
-            flatten: A list of fields to flatten in each item.
-            view: The specific view of the dataset to use when retrieving items.
+        """Get data from the dataset.

-        Returns:
-            An object with filtered, sorted, and paginated dataset items plus pagination details.
+        The backend method for the `Dataset.get_data` call.
         """

     @abstractmethod
-    async def iterate_items(
+    async def iterate(
         self,
         *,
         offset: int = 0,
@@ -99,118 +123,12 @@ async def iterate_items(
         skip_empty: bool = False,
         skip_hidden: bool = False,
     ) -> AsyncIterator[dict]:
-        """Iterate over items in the dataset according to specified filters and sorting.
-
-        This method allows for asynchronously iterating through dataset items while applying various filters such as
-        skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`
-        parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and
-        `skip_hidden` parameters.
+        """Iterate over the dataset.

-        Args:
-            offset: The number of initial items to skip.
-            limit: The maximum number of items to iterate over. None means no limit.
-            clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.
-            desc: If set to True, items are returned in descending order, i.e., newest first.
-            fields: Specifies a subset of fields to include in each item.
-            omit: Specifies a subset of fields to exclude from each item.
-            unwind: Specifies a field that should be unwound into separate items.
-            skip_empty: If set to True, omits items that are empty after other filters have been applied.
-            skip_hidden: If set to True, omits fields starting with the '#' character from the output.
-
-        Yields:
-            An asynchronous iterator of dictionary objects, each representing a dataset item after applying
-            the specified filters and transformations.
+        The backend method for the `Dataset.iterate` call.
         """
         # This syntax is to make mypy properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
         if False: # type: ignore[unreachable]
             yield 0
-
-    @abstractmethod
-    async def get_items_as_bytes(
-        self,
-        *,
-        item_format: str = 'json',
-        offset: int | None = None,
-        limit: int | None = None,
-        desc: bool = False,
-        clean: bool = False,
-        bom: bool = False,
-        delimiter: str | None = None,
-        fields: list[str] | None = None,
-        omit: list[str] | None = None,
-        unwind: str | None = None,
-        skip_empty: bool = False,
-        skip_header_row: bool = False,
-        skip_hidden: bool = False,
-        xml_root: str | None = None,
-        xml_row: str | None = None,
-        flatten: list[str] | None = None,
-    ) -> bytes:
-        """Retrieve dataset items as bytes.
-
-        Args:
-            item_format: Output format (e.g., 'json', 'csv'); default is 'json'.
-            offset: Number of items to skip; default is 0.
-            limit: Max number of items to return; no default limit.
-            desc: If True, results are returned in descending order.
-            clean: If True, filters out empty items and hidden fields.
-            bom: Include or exclude UTF-8 BOM; default behavior varies by format.
-            delimiter: Delimiter character for CSV; default is ','.
-            fields: List of fields to include in the results.
-            omit: List of fields to omit from the results.
-            unwind: Unwinds a field into separate records.
-            skip_empty: If True, skips empty items in the output.
-            skip_header_row: If True, skips the header row in CSV.
-            skip_hidden: If True, skips hidden fields in the output.
-            xml_root: Root element name for XML output; default is 'items'.
-            xml_row: Element name for each item in XML output; default is 'item'.
-            flatten: List of fields to flatten.
-
-        Returns:
-            The dataset items as raw bytes.
-        """
-
-    @abstractmethod
-    async def stream_items(
-        self,
-        *,
-        item_format: str = 'json',
-        offset: int | None = None,
-        limit: int | None = None,
-        desc: bool = False,
-        clean: bool = False,
-        bom: bool = False,
-        delimiter: str | None = None,
-        fields: list[str] | None = None,
-        omit: list[str] | None = None,
-        unwind: str | None = None,
-        skip_empty: bool = False,
-        skip_header_row: bool = False,
-        skip_hidden: bool = False,
-        xml_root: str | None = None,
-        xml_row: str | None = None,
-    ) -> AbstractAsyncContextManager[Response | None]:
-        """Retrieve dataset items as a streaming response.
-
-        Args:
-            item_format: Output format, options include json, jsonl, csv, html, xlsx, xml, rss; default is json.
-            offset: Number of items to skip at the start; default is 0.
-            limit: Maximum number of items to return; no default limit.
-            desc: If True, reverses the order of results.
-            clean: If True, filters out empty items and hidden fields.
-            bom: Include or exclude UTF-8 BOM; varies by format.
-            delimiter: Delimiter for CSV files; default is ','.
-            fields: List of fields to include in the output.
-            omit: List of fields to omit from the output.
-            unwind: Unwinds a field into separate records.
-            skip_empty: If True, empty items are omitted.
-            skip_header_row: If True, skips the header row in CSV.
-            skip_hidden: If True, hides fields starting with the # character.
-            xml_root: Custom root element name for XML output; default is 'items'.
-            xml_row: Custom element name for each item in XML; default is 'item'.
-
-        Yields:
-            The dataset items in a streaming response.
-        """

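The abstract DatasetClient is now a slimmer interface: metadata properties, an open() factory, and drop/push_data/get_data/iterate. Below is a self-contained sketch of a toy in-memory implementation of part of that surface; the class name and storage details are hypothetical, and it deliberately does not subclass the real abstract base so it runs without crawlee installed.

    from __future__ import annotations

    from collections.abc import AsyncIterator
    from datetime import datetime, timezone
    from pathlib import Path
    from typing import Any


    class ToyDatasetClient:
        """Illustrative, simplified take on the new DatasetClient surface."""

        def __init__(self, id: str, name: str | None) -> None:
            self._id = id
            self._name = name
            self._created_at = datetime.now(timezone.utc)
            self._items: list[dict[str, Any]] = []

        @property
        def id(self) -> str:
            return self._id

        @property
        def name(self) -> str | None:
            return self._name

        @property
        def item_count(self) -> int:
            return len(self._items)

        @classmethod
        async def open(cls, id: str | None, name: str | None, storage_dir: Path) -> ToyDatasetClient:
            # A persistent client would locate or create the dataset under storage_dir;
            # this toy version just builds a fresh in-memory instance.
            return cls(id=id or 'default', name=name)

        async def drop(self) -> None:
            self._items.clear()

        async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
            self._items.extend(data if isinstance(data, list) else [data])

        async def iterate(self, *, offset: int = 0, limit: int | None = None) -> AsyncIterator[dict]:
            end = None if limit is None else offset + limit
            for item in self._items[offset:end]:
                yield item

Usage would be along the lines of client = await ToyDatasetClient.open(id=None, name='default', storage_dir=Path('./storage')), followed by await client.push_data({'url': 'https://example.com'}).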