Skip to content

Commit 119a108

Browse files
committed
Storage clients (entrypoints) and their tests
1 parent aa22464 commit 119a108

File tree

5 files changed

+315 -49 lines changed

src/crawlee/storage_clients/_base/_storage_client.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -18,31 +18,31 @@ class StorageClient(ABC):
1818
async def open_dataset_client(
1919
self,
2020
*,
21-
id: str | None,
22-
name: str | None,
23-
purge_on_start: bool,
24-
storage_dir: Path,
21+
id: str | None = None,
22+
name: str | None = None,
23+
purge_on_start: bool = True,
24+
storage_dir: Path | None = None,
2525
) -> DatasetClient:
2626
"""Open the dataset client."""
2727

2828
@abstractmethod
2929
async def open_key_value_store_client(
3030
self,
3131
*,
32-
id: str | None,
33-
name: str | None,
34-
purge_on_start: bool,
35-
storage_dir: Path,
32+
id: str | None = None,
33+
name: str | None = None,
34+
purge_on_start: bool = True,
35+
storage_dir: Path | None = None,
3636
) -> KeyValueStoreClient:
3737
"""Open the key-value store client."""
3838

3939
@abstractmethod
4040
async def open_request_queue_client(
4141
self,
4242
*,
43-
id: str | None,
44-
name: str | None,
45-
purge_on_start: bool,
46-
storage_dir: Path,
43+
id: str | None = None,
44+
name: str | None = None,
45+
purge_on_start: bool = True,
46+
storage_dir: Path | None = None,
4747
) -> RequestQueueClient:
4848
"""Open the request queue client."""

src/crawlee/storage_clients/_file_system/_storage_client.py

Lines changed: 13 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,10 @@ class FileSystemStorageClient(StorageClient):
2121
async def open_dataset_client(
2222
self,
2323
*,
24-
id: str | None,
25-
name: str | None,
26-
purge_on_start: bool,
27-
storage_dir: Path,
24+
id: str | None = None,
25+
name: str | None = None,
26+
purge_on_start: bool = True,
27+
storage_dir: Path | None = None,
2828
) -> FileSystemDatasetClient:
2929
client = await FileSystemDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
3030

@@ -38,10 +38,10 @@ async def open_dataset_client(
3838
async def open_key_value_store_client(
3939
self,
4040
*,
41-
id: str | None,
42-
name: str | None,
43-
purge_on_start: bool,
44-
storage_dir: Path,
41+
id: str | None = None,
42+
name: str | None = None,
43+
purge_on_start: bool = True,
44+
storage_dir: Path | None = None,
4545
) -> FileSystemKeyValueStoreClient:
4646
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, storage_dir=storage_dir)
4747

@@ -55,15 +55,9 @@ async def open_key_value_store_client(
5555
async def open_request_queue_client(
5656
self,
5757
*,
58-
id: str | None,
59-
name: str | None,
60-
purge_on_start: bool,
61-
storage_dir: Path,
58+
id: str | None = None,
59+
name: str | None = None,
60+
purge_on_start: bool = True,
61+
storage_dir: Path | None = None,
6262
) -> FileSystemRequestQueueClient:
63-
client = await FileSystemRequestQueueClient.open(id=id, name=name, storage_dir=storage_dir)
64-
65-
if purge_on_start:
66-
await client.drop()
67-
client = await FileSystemRequestQueueClient.open(id=id, name=name, storage_dir=storage_dir)
68-
69-
return client
63+
pass

src/crawlee/storage_clients/_memory/_storage_client.py

Lines changed: 24 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -21,37 +21,43 @@ class MemoryStorageClient(StorageClient):
2121
async def open_dataset_client(
2222
self,
2323
*,
24-
id: str | None,
25-
name: str | None,
26-
purge_on_start: bool,
27-
storage_dir: Path,
24+
id: str | None = None,
25+
name: str | None = None,
26+
purge_on_start: bool = True,
27+
storage_dir: Path | None = None
2828
) -> MemoryDatasetClient:
29-
dataset_client = await MemoryDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
29+
client = await MemoryDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
3030

3131
if purge_on_start:
32-
await dataset_client.drop()
33-
dataset_client = await MemoryDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
32+
await client.drop()
33+
client = await MemoryDatasetClient.open(id=id, name=name, storage_dir=storage_dir)
3434

35-
return dataset_client
35+
return client
3636

3737
@override
3838
async def open_key_value_store_client(
3939
self,
4040
*,
41-
id: str | None,
42-
name: str | None,
43-
purge_on_start: bool,
44-
storage_dir: Path,
41+
id: str | None = None,
42+
name: str | None = None,
43+
purge_on_start: bool = True,
44+
storage_dir: Path | None = None
4545
) -> MemoryKeyValueStoreClient:
46-
return MemoryKeyValueStoreClient()
46+
client = await MemoryKeyValueStoreClient.open(id=id, name=name, storage_dir=storage_dir)
47+
48+
if purge_on_start:
49+
await client.drop()
50+
client = await MemoryKeyValueStoreClient.open(id=id, name=name, storage_dir=storage_dir)
51+
52+
return client
4753

4854
@override
4955
async def open_request_queue_client(
5056
self,
5157
*,
52-
id: str | None,
53-
name: str | None,
54-
purge_on_start: bool,
55-
storage_dir: Path,
58+
id: str | None = None,
59+
name: str | None = None,
60+
purge_on_start: bool = True,
61+
storage_dir: Path | None = None
5662
) -> MemoryRequestQueueClient:
57-
return MemoryRequestQueueClient()
63+
pass
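(File path not captured in this extract; the hunk below is a newly added test module for FileSystemStorageClient.)

Lines changed: 142 additions & 0 deletions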
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,142 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
import pytest
6+
7+
from crawlee.storage_clients._file_system._dataset_client import FileSystemDatasetClient
8+
from crawlee.storage_clients._file_system._key_value_store_client import FileSystemKeyValueStoreClient
9+
from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient
10+
11+
if TYPE_CHECKING:
12+
from pathlib import Path
13+
14+
pytestmark = pytest.mark.only
15+
16+
17+
@pytest.fixture
18+
async def client() -> FileSystemStorageClient:
19+
return FileSystemStorageClient()
20+
21+
22+
async def test_open_dataset_client(client: FileSystemStorageClient, tmp_path: Path) -> None:
23+
"""Test that open_dataset_client creates a dataset client with correct type and properties."""
24+
dataset_client = await client.open_dataset_client(name='test-dataset', storage_dir=tmp_path)
25+
26+
# Verify correct client type and properties
27+
assert isinstance(dataset_client, FileSystemDatasetClient)
28+
assert dataset_client.name == 'test-dataset'
29+
30+
# Verify directory structure was created
31+
assert dataset_client.path_to_dataset.exists()
32+
33+
34+
async def test_dataset_client_purge_on_start(client: FileSystemStorageClient, tmp_path: Path) -> None:
35+
"""Test that purge_on_start=True clears existing data in the dataset."""
36+
# Create dataset and add data
37+
dataset_client1 = await client.open_dataset_client(
38+
name='test-purge-dataset',
39+
storage_dir=tmp_path,
40+
purge_on_start=True,
41+
)
42+
await dataset_client1.push_data({'item': 'initial data'})
43+
44+
# Verify data was added
45+
items = await dataset_client1.get_data()
46+
assert len(items.items) == 1
47+
48+
# Reopen
49+
dataset_client2 = await client.open_dataset_client(
50+
name='test-purge-dataset',
51+
storage_dir=tmp_path,
52+
purge_on_start=True,
53+
)
54+
55+
# Verify data was purged
56+
items = await dataset_client2.get_data()
57+
assert len(items.items) == 0
58+
59+
60+
async def test_dataset_client_no_purge_on_start(client: FileSystemStorageClient, tmp_path: Path) -> None:
61+
"""Test that purge_on_start=False keeps existing data in the dataset."""
62+
# Create dataset and add data
63+
dataset_client1 = await client.open_dataset_client(
64+
name='test-no-purge-dataset',
65+
storage_dir=tmp_path,
66+
purge_on_start=False,
67+
)
68+
await dataset_client1.push_data({'item': 'preserved data'})
69+
70+
# Reopen
71+
dataset_client2 = await client.open_dataset_client(
72+
name='test-no-purge-dataset',
73+
storage_dir=tmp_path,
74+
purge_on_start=False,
75+
)
76+
77+
# Verify data was preserved
78+
items = await dataset_client2.get_data()
79+
assert len(items.items) == 1
80+
assert items.items[0]['item'] == 'preserved data'
81+
82+
83+
async def test_open_kvs_client(client: FileSystemStorageClient, tmp_path: Path) -> None:
84+
"""Test that open_key_value_store_client creates a KVS client with correct type and properties."""
85+
kvs_client = await client.open_key_value_store_client(name='test-kvs', storage_dir=tmp_path)
86+
87+
# Verify correct client type and properties
88+
assert isinstance(kvs_client, FileSystemKeyValueStoreClient)
89+
assert kvs_client.name == 'test-kvs'
90+
91+
# Verify directory structure was created
92+
assert kvs_client.path_to_kvs.exists()
93+
94+
95+
async def test_kvs_client_purge_on_start(client: FileSystemStorageClient, tmp_path: Path) -> None:
96+
"""Test that purge_on_start=True clears existing data in the key-value store."""
97+
# Create KVS and add data
98+
kvs_client1 = await client.open_key_value_store_client(
99+
name='test-purge-kvs',
100+
storage_dir=tmp_path,
101+
purge_on_start=True,
102+
)
103+
await kvs_client1.set_value(key='test-key', value='initial value')
104+
105+
# Verify value was set
106+
record = await kvs_client1.get_value(key='test-key')
107+
assert record is not None
108+
assert record.value == 'initial value'
109+
110+
# Reopen
111+
kvs_client2 = await client.open_key_value_store_client(
112+
name='test-purge-kvs',
113+
storage_dir=tmp_path,
114+
purge_on_start=True,
115+
)
116+
117+
# Verify value was purged
118+
record = await kvs_client2.get_value(key='test-key')
119+
assert record is None
120+
121+
122+
async def test_kvs_client_no_purge_on_start(client: FileSystemStorageClient, tmp_path: Path) -> None:
123+
"""Test that purge_on_start=False keeps existing data in the key-value store."""
124+
# Create KVS and add data
125+
kvs_client1 = await client.open_key_value_store_client(
126+
name='test-no-purge-kvs',
127+
storage_dir=tmp_path,
128+
purge_on_start=False,
129+
)
130+
await kvs_client1.set_value(key='test-key', value='preserved value')
131+
132+
# Reopen
133+
kvs_client2 = await client.open_key_value_store_client(
134+
name='test-no-purge-kvs',
135+
storage_dir=tmp_path,
136+
purge_on_start=False,
137+
)
138+
139+
# Verify value was preserved
140+
record = await kvs_client2.get_value(key='test-key')
141+
assert record is not None
142+
assert record.value == 'preserved value'

0 commit comments

Comments (0)