14
14
from crawlee .storage_clients .models import DatasetItemsListPage
15
15
16
16
17
- # Properties:
18
- # - id
19
- # - name
20
- # - created_at
21
- # - accessed_at
22
- # - modified_at
23
- # - item_count
24
-
25
- # Methods:
26
- # - open
27
- # - drop
28
- # - push_data
29
- # - get_data
30
- # - iterate
31
-
32
-
33
17
@docs_group ('Abstract classes' )
34
18
class DatasetClient (ABC ):
35
- """An abstract class for dataset resource clients.
19
+ """An abstract class for dataset storage clients.
20
+
21
+ Dataset clients provide an interface for accessing and manipulating dataset storage. They handle
22
+ operations like adding and getting dataset items across different storage backends.
36
23
37
- These clients are specific to the type of resource they manage and operate under a designated storage
38
- client, like a memory storage client.
24
+ Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,
25
+ `RequestQueue`), and can operate with various storage systems including memory, file system,
26
+ databases, and cloud storage solutions.
27
+
28
+ This abstract class defines the interface that all specific dataset clients must implement.
39
29
"""
40
30
41
31
@property
42
32
@abstractmethod
43
33
def id (self ) -> str :
44
- """The ID of the dataset ."""
34
+ """The ID of the dataset, a unique identifier, typically a UUID or similar value."""
45
35
46
36
@property
47
37
@abstractmethod
48
38
def name (self ) -> str | None :
49
- """The name of the dataset."""
39
+ """The optional human-readable name of the dataset."""
50
40
51
41
@property
52
42
@abstractmethod
53
43
def created_at (self ) -> datetime :
54
- """The time at which the dataset was created."""
44
+ """Timestamp when the dataset was first created; it remains unchanged."""
55
45
56
46
@property
57
47
@abstractmethod
58
48
def accessed_at (self ) -> datetime :
59
- """The time at which the dataset was last accessed ."""
49
+ """Timestamp of last access to the dataset, updated on read or write operations ."""
60
50
61
51
@property
62
52
@abstractmethod
63
53
def modified_at (self ) -> datetime :
64
- """The time at which the dataset was last modified ."""
54
+ """Timestamp of last modification of the dataset, updated when new data are added ."""
65
55
66
56
@property
67
57
@abstractmethod
68
58
def item_count (self ) -> int :
69
- """The number of items in the dataset."""
59
+ """Total count of data items stored in the dataset."""
70
60
71
61
@classmethod
72
62
@abstractmethod
73
63
async def open (
74
64
cls ,
75
65
* ,
76
- id : str | None ,
77
- name : str | None ,
78
- storage_dir : Path ,
66
+ id : str | None = None ,
67
+ name : str | None = None ,
68
+ storage_dir : Path | None = None ,
79
69
) -> DatasetClient :
80
70
"""Open existing or create a new dataset client.
81
71
82
- If a dataset with the given name already exists, the appropriate dataset client is returned.
72
+ If a dataset with the given name or ID already exists, the appropriate dataset client is returned.
83
73
Otherwise, a new dataset is created and a client for it is returned.
84
74
75
+ The backend method for the `Dataset.open` call.
76
+
85
77
Args:
86
- id: The ID of the dataset.
87
- name: The name of the dataset.
88
- storage_dir: The path to the storage directory. If the client persists data, it should use this directory.
78
+ id: The ID of the dataset. If not provided, an ID may be generated.
79
+ name: The name of the dataset. If not provided, a default name may be used.
80
+ storage_dir: The path to the storage directory. If the client persists data,
81
+ it should use this directory. May be ignored by non-persistent implementations.
89
82
90
83
Returns:
91
- A dataset client.
84
+ A dataset client instance .
92
85
"""
93
86
94
87
@abstractmethod
@@ -99,7 +92,7 @@ async def drop(self) -> None:
99
92
"""
100
93
101
94
@abstractmethod
102
- async def push_data (self , * , data : list [Any ] | dict [str , Any ]) -> None :
95
+ async def push_data (self , data : list [Any ] | dict [str , Any ]) -> None :
103
96
"""Push data to the dataset.
104
97
105
98
The backend method for the `Dataset.push_data` call.
@@ -121,13 +114,13 @@ async def get_data(
121
114
flatten : list [str ] | None = None ,
122
115
view : str | None = None ,
123
116
) -> DatasetItemsListPage :
124
- """Get data from the dataset.
117
+ """Get data from the dataset with various filtering options .
125
118
126
119
The backend method for the `Dataset.get_data` call.
127
120
"""
128
121
129
122
@abstractmethod
130
- async def iterate (
123
+ async def iterate_items (
131
124
self ,
132
125
* ,
133
126
offset : int = 0 ,
@@ -140,9 +133,9 @@ async def iterate(
140
133
skip_empty : bool = False ,
141
134
skip_hidden : bool = False ,
142
135
) -> AsyncIterator [dict ]:
143
- """Iterate over the dataset.
136
+ """Iterate over the dataset items with filtering options .
144
137
145
- The backend method for the `Dataset.iterate ` call.
138
+ The backend method for the `Dataset.iterate_items ` call.
146
139
"""
147
140
# This syntax is to make mypy properly work with abstract AsyncIterator.
148
141
# https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
0 commit comments