Skip to content

Commit 59d81d8

Browse files
committed
purge only unnamed storages
1 parent ab9e9a4 commit 59d81d8

File tree

12 files changed

+42
-39
lines changed

12 files changed

+42
-39
lines changed

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -47,14 +47,17 @@ class FileSystemDatasetClient(DatasetClient):
4747
_STORAGE_SUBDIR = 'datasets'
4848
"""The name of the subdirectory where datasets are stored."""
4949

50+
_STORAGE_SUBSUBDIR_DEFAULT = 'default'
51+
"""The name of the subdirectory for the default dataset."""
52+
5053
_ITEM_FILENAME_DIGITS = 9
5154
"""Number of digits used for the dataset item file names (e.g., 000000019.json)."""
5255

5356
def __init__(
5457
self,
5558
*,
5659
id: str,
57-
name: str,
60+
name: str | None,
5861
created_at: datetime,
5962
accessed_at: datetime,
6063
modified_at: datetime,
@@ -88,6 +91,9 @@ def metadata(self) -> DatasetMetadata:
8891
@property
8992
def path_to_dataset(self) -> Path:
9093
"""The full path to the dataset directory."""
94+
if self.metadata.name is None:
95+
return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
96+
9197
return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name
9298

9399
@property
@@ -149,8 +155,9 @@ async def open(
149155

150156
# Get a new instance by name.
151157
else:
152-
name = name or configuration.default_dataset_id
153-
dataset_path = dataset_base_path / name
158+
dataset_path = (
159+
dataset_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else dataset_base_path / name
160+
)
154161
metadata_path = dataset_path / METADATA_FILENAME
155162

156163
# If the dataset directory exists, reconstruct the client from the metadata file.

src/crawlee/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -49,11 +49,14 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
4949
_STORAGE_SUBDIR = 'key_value_stores'
5050
"""The name of the subdirectory where key-value stores are stored."""
5151

52+
_STORAGE_SUBSUBDIR_DEFAULT = 'default'
53+
"""The name of the subdirectory for the default key-value store."""
54+
5255
def __init__(
5356
self,
5457
*,
5558
id: str,
56-
name: str,
59+
name: str | None,
5760
created_at: datetime,
5861
accessed_at: datetime,
5962
modified_at: datetime,
@@ -85,6 +88,9 @@ def metadata(self) -> KeyValueStoreMetadata:
8588
@property
8689
def path_to_kvs(self) -> Path:
8790
"""The full path to the key-value store directory."""
91+
if self.metadata.name is None:
92+
return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
93+
8894
return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name
8995

9096
@property
@@ -145,9 +151,9 @@ async def open(
145151

146152
# Get a new instance by name.
147153
else:
148-
name = name or configuration.default_key_value_store_id
149-
150-
kvs_path = storage_dir / cls._STORAGE_SUBDIR / name
154+
kvs_path = (
155+
kvs_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else kvs_base_path / name
156+
)
151157
metadata_path = kvs_path / METADATA_FILENAME
152158

153159
# If the key-value store directory exists, reconstruct the client from the metadata file.

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -48,11 +48,14 @@ class FileSystemRequestQueueClient(RequestQueueClient):
4848
_STORAGE_SUBDIR = 'request_queues'
4949
"""The name of the subdirectory where request queues are stored."""
5050

51+
_STORAGE_SUBSUBDIR_DEFAULT = 'default'
52+
"""The name of the subdirectory for the default request queue."""
53+
5154
def __init__(
5255
self,
5356
*,
5457
id: str,
55-
name: str,
58+
name: str | None,
5659
created_at: datetime,
5760
accessed_at: datetime,
5861
modified_at: datetime,
@@ -100,6 +103,9 @@ def metadata(self) -> RequestQueueMetadata:
100103
@property
101104
def path_to_rq(self) -> Path:
102105
"""The full path to the request queue directory."""
106+
if self.metadata.name is None:
107+
return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
108+
103109
return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name
104110

105111
@property
@@ -165,9 +171,9 @@ async def open(
165171

166172
# Get a new instance by name.
167173
else:
168-
name = name or configuration.default_request_queue_id
169-
170-
rq_path = storage_dir / cls._STORAGE_SUBDIR / name
174+
rq_path = (
175+
rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else rq_base_path / name
176+
)
171177
metadata_path = rq_path / METADATA_FILENAME
172178

173179
# If the RQ directory exists, reconstruct the client from the metadata file.

src/crawlee/storage_clients/_file_system/_storage_client.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ async def open_dataset_client(
2424
configuration = configuration or Configuration.get_global_configuration()
2525
client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration)
2626

27-
if configuration.purge_on_start:
27+
if configuration.purge_on_start and client.metadata.name is None:
2828
await client.purge()
2929

3030
return client
@@ -40,7 +40,7 @@ async def open_key_value_store_client(
4040
configuration = configuration or Configuration.get_global_configuration()
4141
client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
4242

43-
if configuration.purge_on_start:
43+
if configuration.purge_on_start and client.metadata.name is None:
4444
await client.purge()
4545

4646
return client
@@ -56,7 +56,7 @@ async def open_request_queue_client(
5656
configuration = configuration or Configuration.get_global_configuration()
5757
client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration)
5858

59-
if configuration.purge_on_start:
59+
if configuration.purge_on_start and client.metadata.name is None:
6060
await client.purge()
6161

6262
return client

src/crawlee/storage_clients/_memory/_dataset_client.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def __init__(
3535
self,
3636
*,
3737
id: str,
38-
name: str,
38+
name: str | None,
3939
created_at: datetime,
4040
accessed_at: datetime,
4141
modified_at: datetime,
@@ -71,8 +71,6 @@ async def open(
7171
name: str | None,
7272
configuration: Configuration,
7373
) -> MemoryDatasetClient:
74-
name = name or configuration.default_dataset_id
75-
7674
# Otherwise create a new dataset
7775
dataset_id = id or crypto_random_object_id()
7876
now = datetime.now(timezone.utc)

src/crawlee/storage_clients/_memory/_key_value_store_client.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def __init__(
3636
self,
3737
*,
3838
id: str,
39-
name: str,
39+
name: str | None,
4040
created_at: datetime,
4141
accessed_at: datetime,
4242
modified_at: datetime,
@@ -70,8 +70,6 @@ async def open(
7070
name: str | None,
7171
configuration: Configuration,
7272
) -> MemoryKeyValueStoreClient:
73-
name = name or configuration.default_key_value_store_id
74-
7573
# Otherwise create a new key-value store
7674
store_id = id or crypto_random_object_id()
7775
now = datetime.now(timezone.utc)

src/crawlee/storage_clients/_memory/_request_queue_client.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ def __init__(
3939
self,
4040
*,
4141
id: str,
42-
name: str,
42+
name: str | None,
4343
created_at: datetime,
4444
accessed_at: datetime,
4545
modified_at: datetime,
@@ -86,8 +86,6 @@ async def open(
8686
name: str | None,
8787
configuration: Configuration,
8888
) -> MemoryRequestQueueClient:
89-
name = name or configuration.default_request_queue_id
90-
9189
# Otherwise create a new queue
9290
queue_id = id or crypto_random_object_id()
9391
now = datetime.now(timezone.utc)

src/crawlee/storage_clients/_memory/_storage_client.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ async def open_dataset_client(
2424
configuration = configuration or Configuration.get_global_configuration()
2525
client = await MemoryDatasetClient.open(id=id, name=name, configuration=configuration)
2626

27-
if configuration.purge_on_start:
27+
if configuration.purge_on_start and client.metadata.name is None:
2828
await client.purge()
2929

3030
return client
@@ -40,7 +40,7 @@ async def open_key_value_store_client(
4040
configuration = configuration or Configuration.get_global_configuration()
4141
client = await MemoryKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
4242

43-
if configuration.purge_on_start:
43+
if configuration.purge_on_start and client.metadata.name is None:
4444
await client.purge()
4545

4646
return client
@@ -56,7 +56,7 @@ async def open_request_queue_client(
5656
configuration = configuration or Configuration.get_global_configuration()
5757
client = await MemoryRequestQueueClient.open(id=id, name=name, configuration=configuration)
5858

59-
if configuration.purge_on_start:
59+
if configuration.purge_on_start and client.metadata.name is None:
6060
await client.purge()
6161

6262
return client

src/crawlee/storage_clients/models.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ class StorageMetadata(BaseModel):
2828
id: Annotated[str, Field(alias='id')]
2929
"""The unique identifier of the storage."""
3030

31-
name: Annotated[str, Field(alias='name', default='default')]
31+
name: Annotated[str | None, Field(alias='name', default=None)]
3232
"""The name of the storage."""
3333

3434
accessed_at: Annotated[datetime, Field(alias='accessedAt')]

tests/unit/storage_clients/_file_system/test_fs_dataset_client.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,6 @@ async def test_dataset_client_purge_on_start(configuration: Configuration) -> No
7070

7171
# Create dataset and add data
7272
dataset_client1 = await FileSystemStorageClient().open_dataset_client(
73-
name='test-purge-dataset',
7473
configuration=configuration,
7574
)
7675
await dataset_client1.push_data({'item': 'initial data'})
@@ -81,7 +80,6 @@ async def test_dataset_client_purge_on_start(configuration: Configuration) -> No
8180

8281
# Reopen
8382
dataset_client2 = await FileSystemStorageClient().open_dataset_client(
84-
name='test-purge-dataset',
8583
configuration=configuration,
8684
)
8785

0 commit comments

Comments
 (0)