8
8
from crawlee .storage_clients ._base import DatasetClient , KeyValueStoreClient , RequestQueueClient
9
9
10
10
if TYPE_CHECKING :
11
- from crawlee .storage_clients import StorageClient
12
-
13
11
from ._base import Storage
14
12
15
13
T = TypeVar ('T' , bound = 'Storage' )
16
14
17
15
18
16
@dataclass
class _StorageCache:
    """Cache of opened storage instances, indexed by ID, by name and by alias.

    Every index is a three-level mapping: storage type -> ID / name / alias -> additional cache key -> instance.
    For example, a `Dataset` with ID `'123'` cached under the additional cache key `'some_path'` is stored at
    `by_id[Dataset]['123']['some_path']`.
    """

    by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
    )
    """Cache for storage instances by ID. Example: `by_id[Dataset]['some_id']['some_additional_cache_key']`."""

    by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
    )
    """Cache for storage instances by name. Example: `by_name[Dataset]['some_name']['some_additional_cache_key']`."""

    by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
    )
    """Cache for storage instances by alias. Example: `by_alias[Dataset]['some_alias']['some_additional_cache_key']`."""

    def remove_from_cache(self, storage_instance: Storage) -> None:
        """Remove a storage instance from the cache.

        Only entries that hold exactly `storage_instance` (identity comparison) are removed. Entries of other
        storages - including ones cached under the same ID or name with a different additional cache key, and
        storages cached under other aliases - are left untouched. Removing an instance that is not cached is
        a no-op.

        Args:
            storage_instance: The storage instance to remove.
        """
        storage_type = type(storage_instance)

        # The alias used for caching is not looked up from the instance, so every index is searched by identity
        # rather than by key. This also guarantees that only entries of this very instance are dropped.
        for index in (self.by_id, self.by_name, self.by_alias):
            # `.get()` avoids creating empty defaultdict entries as a side effect of the lookup.
            by_key = index.get(storage_type)
            if by_key is None:
                continue

            for by_additional_key in by_key.values():
                # Collect the keys first; a dict must not change size while it is being iterated.
                matching_keys = [
                    additional_key
                    for additional_key, cached_instance in by_additional_key.items()
                    if cached_instance is storage_instance
                ]
                for additional_key in matching_keys:
                    del by_additional_key[additional_key]
42
58
43
ClientOpenerCoro = Coroutine[
    None,
    None,
    DatasetClient | KeyValueStoreClient | RequestQueueClient,
]
"""Type alias for the coroutine that opens a storage client (dataset, key-value store or request queue client)."""
48
62
49
63
@@ -58,7 +72,7 @@ class StorageInstanceManager:
58
72
"""Reserved alias for default unnamed storage."""
59
73
60
74
def __init__(self) -> None:
    """Create the manager with an empty storage instance cache."""
    self._cache = _StorageCache()
62
76
63
77
async def open_storage_instance (
64
78
self ,
@@ -67,9 +81,8 @@ async def open_storage_instance(
67
81
id : str | None ,
68
82
name : str | None ,
69
83
alias : str | None ,
70
- storage_client_type : type [StorageClient ],
71
84
client_opener_coro : ClientOpenerCoro ,
72
- additional_cache_key : Hashable = '' ,
85
+ storage_client_cache_key : Hashable = '' ,
73
86
) -> T :
74
87
"""Open a storage instance with caching support.
75
88
@@ -78,9 +91,8 @@ async def open_storage_instance(
78
91
id: Storage ID.
79
92
name: Storage name. (global scope, persists across runs).
80
93
alias: Storage alias (run scope, creates unnamed storage).
81
- storage_client_type: Type of storage client to use.
82
94
client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
83
- additional_cache_key : Additional optional key to differentiate cache entries.
95
+ storage_client_cache_key : Additional optional key from storage client to differentiate cache entries.
84
96
85
97
Returns:
86
98
The storage instance.
@@ -105,45 +117,31 @@ async def open_storage_instance(
105
117
alias = self ._DEFAULT_STORAGE_ALIAS
106
118
107
119
# Check cache
108
- if id is not None and (
109
- cached_instance := self ._cache_by_storage_client [storage_client_type ]
110
- .by_id [cls ][id ]
111
- .get (additional_cache_key )
112
- ):
120
+ if id is not None and (cached_instance := self ._cache .by_id [cls ][id ].get (storage_client_cache_key )):
113
121
if isinstance (cached_instance , cls ):
114
122
return cached_instance
115
123
raise RuntimeError ('Cached instance type mismatch.' )
116
124
117
- if name is not None and (
118
- cached_instance := self ._cache_by_storage_client [storage_client_type ]
119
- .by_name [cls ][name ]
120
- .get (additional_cache_key )
121
- ):
125
+ if name is not None and (cached_instance := self ._cache .by_name [cls ][name ].get (storage_client_cache_key )):
122
126
if isinstance (cached_instance , cls ):
123
127
return cached_instance
124
128
raise RuntimeError ('Cached instance type mismatch.' )
125
129
126
130
if alias is not None and (
127
- cached_instance := self ._cache_by_storage_client [storage_client_type ]
128
- .by_alias [cls ][alias ]
129
- .get (additional_cache_key )
131
+ cached_instance := self ._cache .by_alias [cls ][alias ].get (storage_client_cache_key )
130
132
):
131
133
if isinstance (cached_instance , cls ):
132
134
return cached_instance
133
135
raise RuntimeError ('Cached instance type mismatch.' )
134
136
135
137
# Check for conflicts between named and alias storages
136
- if alias and (
137
- self ._cache_by_storage_client [storage_client_type ].by_name [cls ][alias ].get (additional_cache_key )
138
- ):
138
+ if alias and (self ._cache .by_name [cls ][alias ].get (storage_client_cache_key )):
139
139
raise ValueError (
140
140
f'Cannot create alias storage "{ alias } " because a named storage with the same name already exists. '
141
141
f'Use a different alias or drop the existing named storage first.'
142
142
)
143
143
144
- if name and (
145
- self ._cache_by_storage_client [storage_client_type ].by_alias [cls ][name ].get (additional_cache_key )
146
- ):
144
+ if name and (self ._cache .by_alias [cls ][name ].get (storage_client_cache_key )):
147
145
raise ValueError (
148
146
f'Cannot create named storage "{ name } " because an alias storage with the same name already exists. '
149
147
f'Use a different name or drop the existing alias storage first.'
@@ -160,17 +158,15 @@ async def open_storage_instance(
160
158
161
159
# Cache the instance.
162
160
# Always cache by id.
163
- self ._cache_by_storage_client [ storage_client_type ] .by_id [cls ][instance .id ][additional_cache_key ] = instance
161
+ self ._cache .by_id [cls ][instance .id ][storage_client_cache_key ] = instance
164
162
165
163
# Cache named storage.
166
164
if instance_name is not None :
167
- self ._cache_by_storage_client [storage_client_type ].by_name [cls ][instance_name ][additional_cache_key ] = (
168
- instance
169
- )
165
+ self ._cache .by_name [cls ][instance_name ][storage_client_cache_key ] = instance
170
166
171
167
# Cache unnamed storage.
172
168
if alias is not None :
173
- self ._cache_by_storage_client [ storage_client_type ] .by_alias [cls ][alias ][additional_cache_key ] = instance
169
+ self ._cache .by_alias [cls ][alias ][storage_client_cache_key ] = instance
174
170
175
171
return instance
176
172
@@ -185,25 +181,8 @@ def remove_from_cache(self, storage_instance: Storage) -> None:
185
181
Args:
186
182
storage_instance: The storage instance to remove.
187
183
"""
188
- storage_type = type (storage_instance )
189
-
190
- for storage_client_cache in self ._cache_by_storage_client .values ():
191
- # Remove from ID cache
192
- for additional_key in storage_client_cache .by_id [storage_type ][storage_instance .id ]:
193
- del storage_client_cache .by_id [storage_type ][storage_instance .id ][additional_key ]
194
- break
195
-
196
- # Remove from name cache or alias cache. It can never be in both.
197
- if storage_instance .name is not None :
198
- for additional_key in storage_client_cache .by_name [storage_type ][storage_instance .name ]:
199
- del storage_client_cache .by_name [storage_type ][storage_instance .name ][additional_key ]
200
- break
201
- else :
202
- for alias_key in storage_client_cache .by_alias [storage_type ]:
203
- for additional_key in storage_client_cache .by_alias [storage_type ][alias_key ]:
204
- del storage_client_cache .by_alias [storage_type ][alias_key ][additional_key ]
205
- break
184
+ self ._cache .remove_from_cache (storage_instance )
206
185
207
186
def clear_cache(self) -> None:
    """Drop every cached storage instance by replacing the cache with a fresh, empty one."""
    self._cache = _StorageCache()
0 commit comments