Skip to content

Commit 638756f

Browse files
committed
Add Apify's version of FS client that keeps the INPUT json
1 parent e5b2bc4 commit 638756f

File tree

6 files changed

+131
-1
lines changed

6 files changed

+131
-1
lines changed

src/apify/storage_clients/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
1+
from crawlee.storage_clients import MemoryStorageClient
22

33
from ._apify import ApifyStorageClient
4+
from ._file_system import FileSystemStorageClient
45

56
__all__ = [
67
'ApifyStorageClient',
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from ._storage_client import ApifyFileSystemStorageClient as FileSystemStorageClient
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import asyncio
2+
3+
from typing_extensions import override
4+
5+
from crawlee._consts import METADATA_FILENAME
6+
from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
7+
8+
from apify._configuration import Configuration
9+
10+
11+
class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
12+
"""Apify-specific implementation of the `FileSystemKeyValueStoreClient`.
13+
14+
The only difference is that it overrides the `purge` method to delete all files in the key-value store
15+
directory, except for the metadata file and the `INPUT.json` file.
16+
"""
17+
18+
@override
19+
async def purge(self) -> None:
20+
"""Purges the key-value store by deleting all its contents.
21+
22+
It deletes all files in the key-value store directory, except for the metadata file and
23+
the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
24+
"""
25+
kvs_input_key = Configuration.get_global_configuration().input_key
26+
async with self._lock:
27+
for file_path in self.path_to_kvs.glob('*'):
28+
if file_path.name in {METADATA_FILENAME, f'{kvs_input_key}.json'}:
29+
continue
30+
if file_path.is_file():
31+
await asyncio.to_thread(file_path.unlink, missing_ok=True)
32+
33+
await self._update_metadata(
34+
update_accessed_at=True,
35+
update_modified_at=True,
36+
)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
from typing_extensions import override
6+
7+
from crawlee._utils.docs import docs_group
8+
from crawlee.configuration import Configuration
9+
from crawlee.storage_clients import FileSystemStorageClient
10+
11+
from ._key_value_store_client import ApifyFileSystemKeyValueStoreClient
12+
13+
if TYPE_CHECKING:
14+
from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
15+
16+
17+
@docs_group('Classes')
18+
class ApifyFileSystemStorageClient(FileSystemStorageClient):
19+
"""Apify-specific implementation of the file system storage client.
20+
21+
The only difference is that it uses `ApifyFileSystemKeyValueStoreClient` for key-value stores,
22+
which overrides the `purge` method to delete all files in the key-value store directory
23+
except for the metadata file and the `INPUT.json` file.
24+
"""
25+
26+
@override
27+
async def create_kvs_client(
28+
self,
29+
*,
30+
id: str | None = None,
31+
name: str | None = None,
32+
configuration: Configuration | None = None,
33+
) -> FileSystemKeyValueStoreClient:
34+
configuration = configuration or Configuration.get_global_configuration()
35+
client = await ApifyFileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
36+
await self._purge_if_needed(client, configuration)
37+
return client

tests/unit/storage_clients/__init__.py

Whitespace-only changes.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
5+
from crawlee._consts import METADATA_FILENAME
6+
7+
from apify import Configuration
8+
from apify.storage_clients._file_system._key_value_store_client import ApifyFileSystemKeyValueStoreClient
9+
10+
11+
async def test_purge_preserves_input_file_and_metadata() -> None:
12+
"""Test that purge() preserves INPUT.json and metadata files but removes other files."""
13+
# Get the global configuration (storage directory is set by test fixtures)
14+
config = Configuration.get_global_configuration()
15+
16+
# Create the key-value store client
17+
kvs_client = await ApifyFileSystemKeyValueStoreClient.open(
18+
id=None,
19+
name='test-kvs',
20+
configuration=config,
21+
)
22+
23+
# Create some test files in the KVS directory
24+
kvs_path = kvs_client.path_to_kvs
25+
26+
# Create various files
27+
kvs_input_filename = f'{config.input_key}.json'
28+
input_file = kvs_path / kvs_input_filename
29+
metadata_file = kvs_path / METADATA_FILENAME
30+
regular_file1 = kvs_path / 'regular_file1.json'
31+
regular_file2 = kvs_path / 'another_file.txt'
32+
33+
# Write content to files
34+
await asyncio.to_thread(input_file.write_text, '{"test": "input"}')
35+
await asyncio.to_thread(regular_file1.write_text, '{"test": "data1"}')
36+
await asyncio.to_thread(regular_file2.write_text, 'some text content')
37+
38+
# Verify all files exist before purge
39+
assert input_file.exists()
40+
assert metadata_file.exists() # Should exist from client creation
41+
assert regular_file1.exists()
42+
assert regular_file2.exists()
43+
44+
# Purge the key-value store
45+
await kvs_client.purge() # Verify INPUT.json and metadata are preserved
46+
assert input_file.exists(), f'{kvs_input_filename} should be preserved during purge'
47+
assert metadata_file.exists(), f'{METADATA_FILENAME} should be preserved during purge'
48+
49+
# Verify other files are deleted
50+
assert not regular_file1.exists(), 'Regular files should be deleted during purge'
51+
assert not regular_file2.exists(), 'Regular files should be deleted during purge'
52+
53+
# Verify INPUT.json content is unchanged
54+
input_content = await asyncio.to_thread(input_file.read_text)
55+
assert input_content == '{"test": "input"}'

0 commit comments

Comments
 (0)