Skip to content

Commit 23ac4bc

Browse files
committed
Update storage clients
1 parent fecae5c commit 23ac4bc

File tree

9 files changed

+165
-44
lines changed

9 files changed

+165
-44
lines changed

src/crawlee/_service_locator.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,8 @@
33
from crawlee._utils.docs import docs_group
44
from crawlee.configuration import Configuration
55
from crawlee.errors import ServiceConflictError
6-
from crawlee.events import EventManager
7-
from crawlee.storage_clients import StorageClient
6+
from crawlee.events import EventManager, LocalEventManager
7+
from crawlee.storage_clients import FileSystemStorageClient, StorageClient
88

99

1010
@docs_group('Classes')
@@ -49,8 +49,6 @@ def set_configuration(self, configuration: Configuration) -> None:
4949
def get_event_manager(self) -> EventManager:
5050
"""Get the event manager."""
5151
if self._event_manager is None:
52-
from crawlee.events import LocalEventManager
53-
5452
self._event_manager = (
5553
LocalEventManager().from_config(config=self._configuration)
5654
if self._configuration
@@ -77,9 +75,7 @@ def set_event_manager(self, event_manager: EventManager) -> None:
7775
def get_storage_client(self) -> StorageClient:
7876
"""Get the storage client."""
7977
if self._storage_client is None:
80-
from crawlee.storage_clients import file_system_storage_client
81-
82-
self._storage_client = file_system_storage_client
78+
self._storage_client = FileSystemStorageClient()
8379

8480
self._storage_client_was_retrieved = True
8581
return self._storage_client
Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from ._base import StorageClient
2-
from ._file_system import file_system_storage_client
3-
from ._memory import memory_storage_client
2+
from ._file_system import FileSystemStorageClient
3+
from ._memory import MemoryStorageClient
44

55
__all__ = [
6+
'FileSystemStorageClient',
7+
'MemoryStorageClient',
68
'StorageClient',
7-
'file_system_storage_client',
8-
'memory_storage_client'
99
]
Lines changed: 38 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,48 @@
11
from __future__ import annotations
22

3-
from dataclasses import dataclass
3+
from abc import ABC, abstractmethod
44
from typing import TYPE_CHECKING
55

66
if TYPE_CHECKING:
7+
from pathlib import Path
8+
79
from ._dataset_client import DatasetClient
810
from ._key_value_store_client import KeyValueStoreClient
911
from ._request_queue_client import RequestQueueClient
1012

1113

12-
@dataclass
13-
class StorageClient:
14-
dataset_client_class: type[DatasetClient]
15-
key_value_store_client_class: type[KeyValueStoreClient]
16-
request_queue_client_class: type[RequestQueueClient]
14+
class StorageClient(ABC):
15+
"""Base class for storage clients."""
16+
17+
@abstractmethod
18+
async def open_dataset_client(
19+
self,
20+
*,
21+
id: str | None,
22+
name: str | None,
23+
purge_on_start: bool,
24+
storage_dir: Path,
25+
) -> DatasetClient:
26+
"""Open the dataset client."""
27+
28+
@abstractmethod
29+
async def open_key_value_store_client(
30+
self,
31+
*,
32+
id: str | None,
33+
name: str | None,
34+
purge_on_start: bool,
35+
storage_dir: Path,
36+
) -> KeyValueStoreClient:
37+
"""Open the key-value store client."""
38+
39+
@abstractmethod
40+
async def open_request_queue_client(
41+
self,
42+
*,
43+
id: str | None,
44+
name: str | None,
45+
purge_on_start: bool,
46+
storage_dir: Path,
47+
) -> RequestQueueClient:
48+
"""Open the request queue client."""
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
1-
from ._storage_client import file_system_storage_client
1+
from ._storage_client import FileSystemStorageClient
22

3-
__all__ = ['file_system_storage_client']
3+
__all__ = ['FileSystemStorageClient']
Lines changed: 57 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,65 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING
4+
5+
from typing_extensions import override
6+
37
from crawlee.storage_clients._base import StorageClient
48

59
from ._dataset_client import FileSystemDatasetClient
610
from ._key_value_store import FileSystemKeyValueStoreClient
711
from ._request_queue import FileSystemRequestQueueClient
812

9-
file_system_storage_client = StorageClient(
10-
dataset_client_class=FileSystemDatasetClient,
11-
key_value_store_client_class=FileSystemKeyValueStoreClient,
12-
request_queue_client_class=FileSystemRequestQueueClient,
13-
)
13+
if TYPE_CHECKING:
14+
from pathlib import Path
15+
16+
17+
class FileSystemStorageClient(StorageClient):
18+
"""File system storage client."""
19+
20+
@override
21+
async def open_dataset_client(
22+
self,
23+
*,
24+
id: str | None,
25+
name: str | None,
26+
purge_on_start: bool,
27+
storage_dir: Path,
28+
) -> FileSystemDatasetClient:
29+
dataset_client = await FileSystemDatasetClient.open(
30+
id=id,
31+
name=name,
32+
storage_dir=storage_dir,
33+
)
34+
35+
if purge_on_start:
36+
await dataset_client.drop()
37+
dataset_client = await FileSystemDatasetClient.open(
38+
id=id,
39+
name=name,
40+
storage_dir=storage_dir,
41+
)
42+
43+
return dataset_client
44+
45+
@override
46+
async def open_key_value_store_client(
47+
self,
48+
*,
49+
id: str | None,
50+
name: str | None,
51+
purge_on_start: bool,
52+
storage_dir: Path,
53+
) -> FileSystemKeyValueStoreClient:
54+
return FileSystemKeyValueStoreClient()
55+
56+
@override
57+
async def open_request_queue_client(
58+
self,
59+
*,
60+
id: str | None,
61+
name: str | None,
62+
purge_on_start: bool,
63+
storage_dir: Path,
64+
) -> FileSystemRequestQueueClient:
65+
return FileSystemRequestQueueClient()
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
1-
from ._storage_client import memory_storage_client
1+
from ._storage_client import MemoryStorageClient
22

3-
__all__ = ['memory_storage_client']
3+
__all__ = ['MemoryStorageClient']
Lines changed: 49 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,57 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING
4+
5+
from typing_extensions import override
6+
37
from crawlee.storage_clients._base import StorageClient
48

59
from ._dataset_client import MemoryDatasetClient
610
from ._key_value_store_client import MemoryKeyValueStoreClient
711
from ._request_queue_client import MemoryRequestQueueClient
812

9-
memory_storage_client = StorageClient(
10-
dataset_client_class=MemoryDatasetClient,
11-
key_value_store_client_class=MemoryKeyValueStoreClient,
12-
request_queue_client_class=MemoryRequestQueueClient,
13-
)
13+
if TYPE_CHECKING:
14+
from pathlib import Path
15+
16+
17+
class MemoryStorageClient(StorageClient):
18+
"""Memory storage client."""
19+
20+
@override
21+
async def open_dataset_client(
22+
self,
23+
*,
24+
id: str | None,
25+
name: str | None,
26+
purge_on_start: bool,
27+
storage_dir: Path,
28+
) -> MemoryDatasetClient:
29+
dataset_client = await MemoryDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
30+
31+
if purge_on_start:
32+
await dataset_client.drop()
33+
dataset_client = await MemoryDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
34+
35+
return dataset_client
36+
37+
@override
38+
async def open_key_value_store_client(
39+
self,
40+
*,
41+
id: str | None,
42+
name: str | None,
43+
purge_on_start: bool,
44+
storage_dir: Path,
45+
) -> MemoryKeyValueStoreClient:
46+
return MemoryKeyValueStoreClient()
47+
48+
@override
49+
async def open_request_queue_client(
50+
self,
51+
*,
52+
id: str | None,
53+
name: str | None,
54+
purge_on_start: bool,
55+
storage_dir: Path,
56+
) -> MemoryRequestQueueClient:
57+
return MemoryRequestQueueClient()

src/crawlee/storages/_dataset.py

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@
3131
# TODO:
3232
# - inherit from storage class
3333
# - export methods
34+
# - caching / memoization of both datasets & dataset clients
3435

3536
# Dataset
3637
# - properties:
@@ -116,30 +117,25 @@ async def open(
116117
id: str | None = None,
117118
name: str | None = None,
118119
purge_on_start: bool | None = None,
120+
storage_dir: Path | None = None,
119121
configuration: Configuration | None = None,
120122
storage_client: StorageClient | None = None,
121123
) -> Dataset:
122124
if id and name:
123125
raise ValueError('Only one of "id" or "name" can be specified, not both.')
124126

125-
configuration = configuration or service_locator.get_configuration()
126-
storage_client = storage_client or service_locator.get_storage_client()
127+
configuration = service_locator.get_configuration() if configuration is None else configuration
128+
storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
127129
purge_on_start = configuration.purge_on_start if purge_on_start is None else purge_on_start
130+
storage_dir = Path(configuration.storage_dir) if storage_dir is None else storage_dir
128131

129-
dataset_client = await storage_client.dataset_client_class.open(
132+
dataset_client = await storage_client.open_dataset_client(
130133
id=id,
131134
name=name,
132-
storage_dir=Path(configuration.storage_dir),
135+
purge_on_start=purge_on_start,
136+
storage_dir=storage_dir,
133137
)
134138

135-
if purge_on_start:
136-
await dataset_client.drop()
137-
dataset_client = await storage_client.dataset_client_class.open(
138-
id=id,
139-
name=name,
140-
storage_dir=Path(configuration.storage_dir),
141-
)
142-
143139
return cls(dataset_client)
144140

145141
async def drop(self) -> None:

website/generate_module_shortcuts.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import importlib
66
import inspect
77
import json
8+
from pathlib import Path
89
from typing import TYPE_CHECKING
910

1011
if TYPE_CHECKING:
@@ -55,5 +56,5 @@ def resolve_shortcuts(shortcuts: dict) -> None:
5556

5657
resolve_shortcuts(shortcuts)
5758

58-
with open('module_shortcuts.json', 'w', encoding='utf-8') as shortcuts_file:
59+
with Path('module_shortcuts.json').open('w', encoding='utf-8') as shortcuts_file:
5960
json.dump(shortcuts, shortcuts_file, indent=4, sort_keys=True)

0 commit comments

Comments
 (0)