11from __future__ import annotations
22
3+ from asyncio import Lock
34from collections import defaultdict
45from collections .abc import Coroutine , Hashable
56from dataclasses import dataclass , field
67from typing import TYPE_CHECKING , TypeVar
8+ from weakref import WeakValueDictionary
79
810from crawlee ._utils .raise_if_too_many_kwargs import raise_if_too_many_kwargs
911from crawlee .storage_clients ._base import DatasetClient , KeyValueStoreClient , RequestQueueClient
@@ -76,6 +78,7 @@ class StorageInstanceManager:
7678
def __init__(self) -> None:
    """Set up an empty storage cache and the registry of opener locks."""
    # Locks are referenced weakly: an entry disappears from the registry as soon as
    # nothing else holds a strong reference to its lock.
    self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()
    self._cache: _StorageCache = _StorageCache()
7982
8083 async def open_storage_instance (
8184 self ,
@@ -120,62 +123,74 @@ async def open_storage_instance(
120123 alias = self ._DEFAULT_STORAGE_ALIAS
121124
122125 # Check cache
123- if id is not None and (cached_instance := self ._cache .by_id [cls ][id ].get (storage_client_cache_key )):
124- if isinstance (cached_instance , cls ):
125- return cached_instance
126- raise RuntimeError ('Cached instance type mismatch.' )
127-
128- if name is not None and (cached_instance := self ._cache .by_name [cls ][name ].get (storage_client_cache_key )):
129- if isinstance (cached_instance , cls ):
130- return cached_instance
131- raise RuntimeError ('Cached instance type mismatch.' )
132-
133- if alias is not None and (
134- cached_instance := self ._cache .by_alias [cls ][alias ].get (storage_client_cache_key )
126+ if cached_instance := self ._get_from_cache (
127+ cls ,
128+ id = id ,
129+ name = name ,
130+ alias = alias ,
131+ storage_client_cache_key = storage_client_cache_key ,
135132 ):
136- if isinstance (cached_instance , cls ):
137- return cached_instance
138- raise RuntimeError ('Cached instance type mismatch.' )
133+ return cached_instance
139134
140135 # Check for conflicts between named and alias storages
141- if alias and (self ._cache .by_name [cls ][alias ].get (storage_client_cache_key )):
142- raise ValueError (
143- f'Cannot create alias storage "{ alias } " because a named storage with the same name already exists. '
144- f'Use a different alias or drop the existing named storage first.'
145- )
146-
147- if name and (self ._cache .by_alias [cls ][name ].get (storage_client_cache_key )):
148- raise ValueError (
149- f'Cannot create named storage "{ name } " because an alias storage with the same name already exists. '
150- f'Use a different name or drop the existing alias storage first.'
151- )
136+ self ._check_name_alias_conflict (
137+ cls ,
138+ name = name ,
139+ alias = alias ,
140+ storage_client_cache_key = storage_client_cache_key ,
141+ )
152142
153143 # Validate storage name
154144 if name is not None :
155145 validate_storage_name (name )
156146
157- # Create new instance
158- client : KeyValueStoreClient | DatasetClient | RequestQueueClient
159- client = await client_opener_coro
147+ # Acquire lock for this opener
148+ opener_lock_key = (cls , str (id or name or alias ), storage_client_cache_key )
149+ if not (lock := self ._opener_locks .get (opener_lock_key )):
150+ lock = Lock ()
151+ self ._opener_locks [opener_lock_key ] = lock
152+
153+ async with lock :
154+ # Re-check cache inside the lock
155+ if cached_instance := self ._get_from_cache (
156+ cls ,
157+ id = id ,
158+ name = name ,
159+ alias = alias ,
160+ storage_client_cache_key = storage_client_cache_key ,
161+ ):
162+ return cached_instance
160163
161- metadata = await client .get_metadata ()
164+ # Re-check for conflicts between named and alias storages
165+ self ._check_name_alias_conflict (
166+ cls ,
167+ name = name ,
168+ alias = alias ,
169+ storage_client_cache_key = storage_client_cache_key ,
170+ )
162171
163- instance = cls (client , metadata .id , metadata .name ) # type: ignore[call-arg]
164- instance_name = getattr (instance , 'name' , None )
172+ # Create new instance
173+ client : KeyValueStoreClient | DatasetClient | RequestQueueClient
174+ client = await client_opener_coro
165175
166- # Cache the instance.
167- # Always cache by id.
168- self ._cache .by_id [cls ][instance .id ][storage_client_cache_key ] = instance
176+ metadata = await client .get_metadata ()
169177
170- # Cache named storage.
171- if instance_name is not None :
172- self ._cache .by_name [cls ][instance_name ][storage_client_cache_key ] = instance
178+ instance = cls (client , metadata .id , metadata .name ) # type: ignore[call-arg]
179+ instance_name = getattr (instance , 'name' , None )
173180
174- # Cache unnamed storage .
175- if alias is not None :
176- self ._cache .by_alias [cls ][alias ][storage_client_cache_key ] = instance
181+ # Cache the instance .
182+ # Always cache by id.
183+ self ._cache .by_id [cls ][instance . id ][storage_client_cache_key ] = instance
177184
178- return instance
185+ # Cache named storage.
186+ if instance_name is not None :
187+ self ._cache .by_name [cls ][instance_name ][storage_client_cache_key ] = instance
188+
189+ # Cache unnamed storage.
190+ if alias is not None :
191+ self ._cache .by_alias [cls ][alias ][storage_client_cache_key ] = instance
192+
193+ return instance
179194
180195 finally :
181196 # Make sure the client opener is closed.
@@ -193,3 +208,51 @@ def remove_from_cache(self, storage_instance: Storage) -> None:
def clear_cache(self) -> None:
    """Drop every cached storage instance by swapping in a brand-new, empty cache."""
    empty_cache = _StorageCache()
    self._cache = empty_cache
211+
def _get_from_cache(
    self,
    cls: type[T],
    *,
    id: str | None = None,
    name: str | None = None,
    alias: str | None = None,
    storage_client_cache_key: Hashable = '',
) -> T | None:
    """Look up a previously cached storage instance.

    The cache is consulted by `id` first, then by `name`, then by `alias`. Identifiers that are `None`
    are skipped, and the first hit wins.

    Args:
        cls: The storage class the cached instance is expected to be an instance of.
        id: Storage ID to look up, if any.
        name: Storage name to look up, if any.
        alias: Storage alias to look up, if any.
        storage_client_cache_key: Key distinguishing entries cached for the same identifier.

    Returns:
        The cached instance, or `None` when none of the provided identifiers has a cached entry.

    Raises:
        RuntimeError: If a cached entry is found but it is not an instance of `cls`.
    """
    # Ordered (identifier, index) pairs; the order defines the lookup priority.
    lookups = (
        (id, self._cache.by_id),
        (name, self._cache.by_name),
        (alias, self._cache.by_alias),
    )

    for key, index in lookups:
        if key is None:
            continue

        cached_instance = index[cls][key].get(storage_client_cache_key)

        # Compare against None explicitly: a cached instance that happens to evaluate as falsy
        # (e.g. a type defining `__bool__` or `__len__`) must still count as a cache hit.
        if cached_instance is None:
            continue

        if isinstance(cached_instance, cls):
            return cached_instance
        raise RuntimeError('Cached instance type mismatch.')

    return None
238+
def _check_name_alias_conflict(
    self,
    cls: type[T],
    *,
    name: str | None = None,
    alias: str | None = None,
    storage_client_cache_key: Hashable = '',
) -> None:
    """Ensure a requested name or alias does not collide with a cached storage of the other kind.

    Args:
        cls: The storage class whose cache entries are inspected.
        name: Requested storage name; checked against cached alias storages.
        alias: Requested storage alias; checked against cached named storages.
        storage_client_cache_key: Key distinguishing entries cached for the same identifier.

    Raises:
        ValueError: If `alias` matches the name of a cached named storage, or `name` matches the
            alias of a cached alias storage, under the same `storage_client_cache_key`.
    """
    # Presence is tested with `is not None` so that a cached instance which evaluates as falsy
    # is still recognised as an existing, conflicting storage.
    if alias and self._cache.by_name[cls][alias].get(storage_client_cache_key) is not None:
        raise ValueError(
            f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
            f'Use a different alias or drop the existing named storage first.'
        )

    if name and self._cache.by_alias[cls][name].get(storage_client_cache_key) is not None:
        raise ValueError(
            f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
            f'Use a different name or drop the existing alias storage first.'
        )
0 commit comments