7
7
8
8
if TYPE_CHECKING :
9
9
from collections .abc import AsyncIterator
10
- from contextlib import AbstractAsyncContextManager
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Any
11
13
12
- from httpx import Response
13
-
14
- from crawlee ._types import JsonSerializable
15
- from crawlee .storage_clients .models import DatasetItemsListPage , DatasetMetadata
14
+ from crawlee .storage_clients .models import DatasetItemsListPage
16
15
17
16
18
17
@docs_group ('Abstract classes' )
@@ -23,35 +22,78 @@ class DatasetClient(ABC):
23
22
client, like a memory storage client.
24
23
"""
25
24
26
- _LIST_ITEMS_LIMIT = 999_999_999_999
27
- """This is what API returns in the x-apify-pagination-limit header when no limit query parameter is used."""
25
+ @property
26
+ @abstractmethod
27
+ def id (self ) -> str :
28
+ """The ID of the dataset."""
29
+
30
+ @property
31
+ @abstractmethod
32
+ def name (self ) -> str | None :
33
+ """The name of the dataset."""
34
+
35
+ @property
36
+ @abstractmethod
37
+ def created_at (self ) -> datetime :
38
+ """The time at which the dataset was created."""
39
+
40
+ @property
41
+ @abstractmethod
42
+ def accessed_at (self ) -> datetime :
43
+ """The time at which the dataset was last accessed."""
28
44
45
+ @property
29
46
@abstractmethod
30
- async def get (self ) -> DatasetMetadata | None :
31
- """Get metadata about the dataset being managed by this client.
47
+ def modified_at (self ) -> datetime :
48
+ """The time at which the dataset was last modified."""
49
+
50
+ @property
51
+ @abstractmethod
52
+ def item_count (self ) -> int :
53
+ """The number of items in the dataset."""
54
+
55
+ @classmethod
56
+ @abstractmethod
57
+ async def open (
58
+ cls ,
59
+ id : str | None ,
60
+ name : str | None ,
61
+ storage_dir : Path ,
62
+ ) -> DatasetClient :
63
+ """Open existing or create a new dataset client.
64
+
65
+ If a dataset with the given name already exists, the appropriate dataset client is returned.
66
+ Otherwise, a new dataset is created and client for it is returned.
67
+
68
+ Args:
69
+ id: The ID of the dataset.
70
+ name: The name of the dataset.
71
+ storage_dir: The path to the storage directory. If the client persists data, it should use this directory.
32
72
33
73
Returns:
34
- An object containing the dataset's details, or None if the dataset does not exist .
74
+ A dataset client .
35
75
"""
36
76
37
77
@abstractmethod
38
- async def delete (self ) -> None :
39
- """Permanently delete the dataset managed by this client."""
78
+ async def drop (self ) -> None :
79
+ """Drop the whole dataset and remove all its items.
80
+
81
+ The backend method for the `Dataset.drop` call.
82
+ """
40
83
41
84
@abstractmethod
42
- async def push_items (self , items : JsonSerializable ) -> None :
43
- """Push items to the dataset.
85
+ async def push_data (self , data : list [ Any ] | dict [ str , Any ] ) -> None :
86
+ """Push data to the dataset.
44
87
45
- Args:
46
- items: The items which to push in the dataset. They must be JSON serializable.
88
+ The backend method for the `Dataset.push_data` call.
47
89
"""
48
90
49
91
@abstractmethod
50
- async def list_items (
92
+ async def get_data (
51
93
self ,
52
94
* ,
53
- offset : int | None = 0 ,
54
- limit : int | None = _LIST_ITEMS_LIMIT ,
95
+ offset : int = 0 ,
96
+ limit : int | None = 999_999_999_999 ,
55
97
clean : bool = False ,
56
98
desc : bool = False ,
57
99
fields : list [str ] | None = None ,
@@ -62,31 +104,13 @@ async def list_items(
62
104
flatten : list [str ] | None = None ,
63
105
view : str | None = None ,
64
106
) -> DatasetItemsListPage :
65
- """Retrieve a paginated list of items from a dataset based on various filtering parameters.
66
-
67
- This method provides the flexibility to filter, sort, and modify the appearance of dataset items
68
- when listed. Each parameter modifies the result set according to its purpose. The method also
69
- supports pagination through 'offset' and 'limit' parameters.
70
-
71
- Args:
72
- offset: The number of initial items to skip.
73
- limit: The maximum number of items to return.
74
- clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.
75
- desc: If True, items are returned in descending order, i.e., newest first.
76
- fields: Specifies a subset of fields to include in each item.
77
- omit: Specifies a subset of fields to exclude from each item.
78
- unwind: Specifies a field that should be unwound. If it's an array, each element becomes a separate record.
79
- skip_empty: If True, omits items that are empty after other filters have been applied.
80
- skip_hidden: If True, omits fields starting with the '#' character.
81
- flatten: A list of fields to flatten in each item.
82
- view: The specific view of the dataset to use when retrieving items.
107
+ """Get data from the dataset.
83
108
84
- Returns:
85
- An object with filtered, sorted, and paginated dataset items plus pagination details.
109
+ The backend method for the `Dataset.get_data` call.
86
110
"""
87
111
88
112
@abstractmethod
89
- async def iterate_items (
113
+ async def iterate (
90
114
self ,
91
115
* ,
92
116
offset : int = 0 ,
@@ -99,118 +123,12 @@ async def iterate_items(
99
123
skip_empty : bool = False ,
100
124
skip_hidden : bool = False ,
101
125
) -> AsyncIterator [dict ]:
102
- """Iterate over items in the dataset according to specified filters and sorting.
103
-
104
- This method allows for asynchronously iterating through dataset items while applying various filters such as
105
- skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`
106
- parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and
107
- `skip_hidden` parameters.
126
+ """Iterate over the dataset.
108
127
109
- Args:
110
- offset: The number of initial items to skip.
111
- limit: The maximum number of items to iterate over. None means no limit.
112
- clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.
113
- desc: If set to True, items are returned in descending order, i.e., newest first.
114
- fields: Specifies a subset of fields to include in each item.
115
- omit: Specifies a subset of fields to exclude from each item.
116
- unwind: Specifies a field that should be unwound into separate items.
117
- skip_empty: If set to True, omits items that are empty after other filters have been applied.
118
- skip_hidden: If set to True, omits fields starting with the '#' character from the output.
119
-
120
- Yields:
121
- An asynchronous iterator of dictionary objects, each representing a dataset item after applying
122
- the specified filters and transformations.
128
+ The backend method for the `Dataset.iterate` call.
123
129
"""
124
130
# This syntax is to make mypy properly work with abstract AsyncIterator.
125
131
# https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
126
132
raise NotImplementedError
127
133
if False : # type: ignore[unreachable]
128
134
yield 0
129
-
130
- @abstractmethod
131
- async def get_items_as_bytes (
132
- self ,
133
- * ,
134
- item_format : str = 'json' ,
135
- offset : int | None = None ,
136
- limit : int | None = None ,
137
- desc : bool = False ,
138
- clean : bool = False ,
139
- bom : bool = False ,
140
- delimiter : str | None = None ,
141
- fields : list [str ] | None = None ,
142
- omit : list [str ] | None = None ,
143
- unwind : str | None = None ,
144
- skip_empty : bool = False ,
145
- skip_header_row : bool = False ,
146
- skip_hidden : bool = False ,
147
- xml_root : str | None = None ,
148
- xml_row : str | None = None ,
149
- flatten : list [str ] | None = None ,
150
- ) -> bytes :
151
- """Retrieve dataset items as bytes.
152
-
153
- Args:
154
- item_format: Output format (e.g., 'json', 'csv'); default is 'json'.
155
- offset: Number of items to skip; default is 0.
156
- limit: Max number of items to return; no default limit.
157
- desc: If True, results are returned in descending order.
158
- clean: If True, filters out empty items and hidden fields.
159
- bom: Include or exclude UTF-8 BOM; default behavior varies by format.
160
- delimiter: Delimiter character for CSV; default is ','.
161
- fields: List of fields to include in the results.
162
- omit: List of fields to omit from the results.
163
- unwind: Unwinds a field into separate records.
164
- skip_empty: If True, skips empty items in the output.
165
- skip_header_row: If True, skips the header row in CSV.
166
- skip_hidden: If True, skips hidden fields in the output.
167
- xml_root: Root element name for XML output; default is 'items'.
168
- xml_row: Element name for each item in XML output; default is 'item'.
169
- flatten: List of fields to flatten.
170
-
171
- Returns:
172
- The dataset items as raw bytes.
173
- """
174
-
175
- @abstractmethod
176
- async def stream_items (
177
- self ,
178
- * ,
179
- item_format : str = 'json' ,
180
- offset : int | None = None ,
181
- limit : int | None = None ,
182
- desc : bool = False ,
183
- clean : bool = False ,
184
- bom : bool = False ,
185
- delimiter : str | None = None ,
186
- fields : list [str ] | None = None ,
187
- omit : list [str ] | None = None ,
188
- unwind : str | None = None ,
189
- skip_empty : bool = False ,
190
- skip_header_row : bool = False ,
191
- skip_hidden : bool = False ,
192
- xml_root : str | None = None ,
193
- xml_row : str | None = None ,
194
- ) -> AbstractAsyncContextManager [Response | None ]:
195
- """Retrieve dataset items as a streaming response.
196
-
197
- Args:
198
- item_format: Output format, options include json, jsonl, csv, html, xlsx, xml, rss; default is json.
199
- offset: Number of items to skip at the start; default is 0.
200
- limit: Maximum number of items to return; no default limit.
201
- desc: If True, reverses the order of results.
202
- clean: If True, filters out empty items and hidden fields.
203
- bom: Include or exclude UTF-8 BOM; varies by format.
204
- delimiter: Delimiter for CSV files; default is ','.
205
- fields: List of fields to include in the output.
206
- omit: List of fields to omit from the output.
207
- unwind: Unwinds a field into separate records.
208
- skip_empty: If True, empty items are omitted.
209
- skip_header_row: If True, skips the header row in CSV.
210
- skip_hidden: If True, hides fields starting with the # character.
211
- xml_root: Custom root element name for XML output; default is 'items'.
212
- xml_row: Custom element name for each item in XML; default is 'item'.
213
-
214
- Yields:
215
- The dataset items in a streaming response.
216
- """
0 commit comments