Skip to content

Commit fd62c22

Browse files
committed
Update the client, to work with Apify CLI
- no modifications to the input file - accepting various name forms - error if multiple valid input files exist at the same time
1 parent d04cf6c commit fd62c22

File tree

2 files changed

+183
-41
lines changed

2 files changed

+183
-41
lines changed
Lines changed: 83 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
import asyncio
22
import json
33
import logging
4+
from itertools import chain
5+
from pathlib import Path
46

5-
from more_itertools import flatten
67
from typing_extensions import Self, override
78

89
from crawlee._consts import METADATA_FILENAME
10+
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
911
from crawlee.configuration import Configuration as CrawleeConfiguration
1012
from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
11-
from crawlee.storage_clients.models import KeyValueStoreRecord
13+
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
1214

13-
from apify._configuration import Configuration
15+
from apify._configuration import Configuration as ApifyConfiguration
1416

1517
logger = logging.getLogger(__name__)
1618

@@ -22,6 +24,18 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
2224
directory, except for the metadata file and the `INPUT.json` file.
2325
"""
2426

27+
def __init__(
    self,
    *,
    metadata: KeyValueStoreMetadata,
    path_to_kvs: Path,
    lock: asyncio.Lock,
) -> None:
    """Initialize the client and seed the input key settings from the global configuration.

    Args:
        metadata: Metadata of the key-value store, forwarded to the parent client.
        path_to_kvs: Directory of the key-value store, forwarded to the parent client.
        lock: Lock forwarded to the parent client.
    """
    super().__init__(metadata=metadata, path_to_kvs=path_to_kvs, lock=lock)

    default_input_key = ApifyConfiguration.get_global_configuration().input_key
    # Both the logical key and the on-disk file name start out as the configured input key.
    self._input_key = default_input_key
    self._input_key_filename = default_input_key
38+
2539
@override
2640
@classmethod
2741
async def open(
@@ -34,7 +48,18 @@ async def open(
3448
) -> Self:
3549
client = await super().open(id=id, name=name, alias=alias, configuration=configuration)
3650

37-
await client._sanitize_input_json_files() # noqa: SLF001 - it's okay, this is a factory method
51+
if isinstance(configuration, ApifyConfiguration):
52+
client._input_key = configuration.input_key # noqa: SLF001 - it's okay, this is a factory method
53+
input_key_filename = cls._get_input_key_file_name(
54+
path_to_kvs=client.path_to_kvs, configuration=configuration
55+
)
56+
client._input_key_filename = input_key_filename # noqa: SLF001 - it's okay, this is a factory method
57+
input_file_path = client.path_to_kvs / input_key_filename
58+
input_file_metadata_path = client.path_to_kvs / f'{input_file_path}.{METADATA_FILENAME}'
59+
if input_file_path.exists() and not input_file_metadata_path.exists():
60+
await cls._create_missing_metadata_for_input_file(
61+
key=configuration.input_key, record_path=input_file_path
62+
)
3863

3964
return client
4065

@@ -43,14 +68,10 @@ async def purge(self) -> None:
4368
"""Purges the key-value store by deleting all its contents.
4469
4570
It deletes all files in the key-value store directory, except for the metadata file and
46-
the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
71+
the input related file and its metadata.
4772
"""
48-
configuration = Configuration.get_global_configuration()
49-
5073
async with self._lock:
51-
files_to_keep = set(
52-
flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates)
53-
)
74+
files_to_keep = {self._input_key_filename, f'{self._input_key_filename}.{METADATA_FILENAME}'}
5475
files_to_keep.add(METADATA_FILENAME)
5576

5677
for file_path in self.path_to_kvs.glob('*'):
@@ -64,40 +85,61 @@ async def purge(self) -> None:
6485
update_modified_at=True,
6586
)
6687

67-
async def _sanitize_input_json_files(self) -> None:
68-
"""Handle missing metadata for input files."""
69-
configuration = Configuration.get_global_configuration()
70-
alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key}
71-
72-
if (self.path_to_kvs / configuration.canonical_input_key).exists():
73-
# Refresh metadata to prevent inconsistencies
74-
input_data = await asyncio.to_thread(
75-
lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text())
76-
)
77-
await self.set_value(key=configuration.canonical_input_key, value=input_data)
88+
@override
async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
    """Return the record stored under `key`, resolving the input key to its actual file name.

    Args:
        key: Key of the record to read.

    Returns:
        Whatever the parent client returns for the resolved key.
    """
    # The input record may live in a file whose name differs from the input key (custom input file name).
    resolved_key = self._input_key_filename if key == self._input_key else key
    return await super().get_value(key=resolved_key)
7894

79-
for alternative_key in alternative_keys:
80-
if (alternative_input_file := self.path_to_kvs / alternative_key).exists():
81-
logger.warning(f'Redundant input file found: {alternative_input_file}')
95+
@staticmethod
async def _create_missing_metadata_for_input_file(key: str, record_path: Path) -> None:
    """Create the metadata file for an input record file that does not have one.

    The record file is only read. Its value is interpreted according to the file suffix so that a content
    type can be inferred, and the resulting record metadata is written next to the record file as
    `<record file name>.<METADATA_FILENAME>`.

    Args:
        key: Key stored in the record metadata.
        record_path: Path to the existing input record file.

    Raises:
        json.JSONDecodeError: If a `.json` file does not contain valid JSON.
        UnicodeDecodeError: If a `.json` or `.txt` file is not valid UTF-8.
    """
    # Read the actual value
    try:
        content = await asyncio.to_thread(record_path.read_bytes)
    except FileNotFoundError:
        # The file was removed between discovery and reading; there is nothing to describe.
        logger.warning(f'Input file disappeared on path: "{record_path}"')
        return

    # Figure out the metadata from the file content
    size = len(content)
    if record_path.suffix == '.json':
        value = json.loads(content.decode('utf-8'))
    elif record_path.suffix == '.txt':
        value = content.decode('utf-8')
    elif record_path.suffix == '':
        # Without a suffix the content is a guess: try JSON first, otherwise keep the raw bytes.
        # Arbitrary binary content may not even be valid UTF-8, so decoding failures fall back as well.
        try:
            value = json.loads(content.decode('utf-8'))
        except (json.JSONDecodeError, UnicodeDecodeError):
            value = content
    else:
        value = content

    content_type = infer_mime_type(value)

    record_metadata = KeyValueStoreRecordMetadata(key=key, content_type=content_type, size=size)
    record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')
    record_metadata_content = await json_dumps(record_metadata.model_dump())

    # Write the record metadata to the file.
    await atomic_write(record_metadata_filepath, record_metadata_content)
102126

103-
return await super().get_value(key=key)
127+
@staticmethod
def _get_input_key_file_name(path_to_kvs: Path, configuration: ApifyConfiguration) -> str:
    """Determine the name of the file that holds the input record in the key-value store directory.

    Candidates are `<input_key>` and `<input_key>.<anything>` directly inside `path_to_kvs`. Files whose
    name ends with `.<METADATA_FILENAME>` are metadata files and are not candidates.

    Args:
        path_to_kvs: Directory of the key-value store to search.
        configuration: Configuration providing `input_key`.

    Returns:
        The name of the single input file found, or `configuration.input_key` when no file exists.

    Raises:
        RuntimeError: If more than one candidate input file is found.
    """
    metadata_file_ending = f'.{METADATA_FILENAME}'
    found_input_files = set()
    for file_path in chain(
        path_to_kvs.glob(f'{configuration.input_key}.*'), path_to_kvs.glob(f'{configuration.input_key}')
    ):
        # `Path.suffix` is only the last dot-separated component, so it can never equal a metadata file
        # name that itself contains a dot. Compare the end of the whole file name instead.
        if file_path.name.endswith(metadata_file_ending):
            # Ignore metadata files
            continue
        found_input_files.add(file_path.name)

    if len(found_input_files) > 1:
        raise RuntimeError(f'Only one input file is allowed. Following input files found: {found_input_files}')

    if len(found_input_files) == 1:
        return found_input_files.pop()

    # No custom input file found, return the default input key
    return configuration.input_key

tests/unit/test_apify_storages.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
import asyncio
2+
import json
13
from datetime import datetime, timezone
4+
from pathlib import Path
25
from unittest import mock
36
from unittest.mock import AsyncMock
47

@@ -15,6 +18,10 @@
1518
from apify.storage_clients._file_system import ApifyFileSystemKeyValueStoreClient, ApifyFileSystemStorageClient
1619
from apify.storages import Dataset, KeyValueStore, RequestQueue
1720

21+
EXAMPLE_JSON_INPUT = json.dumps({'key': 'value'})
22+
EXAMPLE_TXT_INPUT = 'Best input ever'
23+
EXAMPLE_BYTES_INPUT = b'High quality bytes'
24+
1825

1926
@pytest.mark.parametrize(
2027
('storage', '_storage_client'),
@@ -92,3 +99,96 @@ async def test_first_filesystem_storage_client_wins() -> None:
9299

93100
assert kvs_3 is kvs_4
94101
assert type(kvs_4._client) is FileSystemKeyValueStoreClient
102+
103+
104+
@pytest.fixture(params=['INPUT', 'FOO'])
def input_test_configuration(tmp_path: Path, request: pytest.FixtureRequest) -> Configuration:
    """Provide a configuration rooted in `tmp_path`, once for each parametrized input key.

    The default key-value store directory is created empty; tests place their own input files into it.
    """
    default_kvs_dir = tmp_path / 'key_value_stores' / 'default'
    default_kvs_dir.mkdir(parents=True)

    config = Configuration()
    config.input_key = request.param
    config.storage_dir = str(tmp_path)
    # Explicitly demand purge. Input file should survive this.
    config.purge_on_start = True
    return config
115+
116+
117+
async def test_multiple_input_file_formats_cause_error(input_test_configuration: Configuration) -> None:
    """Opening the store fails when two files compete for the input key, e.g. `INPUT` and `INPUT.json`."""
    input_key = input_test_configuration.input_key
    kvs_path = Path(input_test_configuration.storage_dir) / 'key_value_stores' / 'default'
    bare_input_file = kvs_path / f'{input_key}'
    json_input_file = kvs_path / f'{input_key}.json'

    # First candidate: extension-less file with raw bytes.
    with open(bare_input_file, 'wb') as f:  # noqa: ASYNC230 # It is ok for a test to use sync I/O
        f.write(EXAMPLE_BYTES_INPUT)

    # Second candidate: the same key with a `.json` extension.
    with open(json_input_file, 'w', encoding='utf-8') as f:  # noqa: ASYNC230 # It is ok for a test to use sync I/O
        f.write(EXAMPLE_JSON_INPUT)

    expected_message = r'Only one input file is allowed. Following input files found: .*'
    with pytest.raises(RuntimeError, match=expected_message):
        await KeyValueStore.open(
            storage_client=ApifyFileSystemStorageClient(),
            configuration=input_test_configuration,
        )
133+
134+
135+
async def test_txt_input_missing_metadata(input_test_configuration: Configuration) -> None:
    """A `.txt` input file without a metadata file is readable and is left untouched on disk."""
    input_key = input_test_configuration.input_key
    kvs_path = Path(input_test_configuration.storage_dir) / 'key_value_stores' / 'default'

    # Only the record file is written; no metadata file accompanies it.
    input_file = kvs_path / f'{input_key}.txt'
    with open(input_file, 'w', encoding='utf-8') as f:  # noqa: ASYNC230 # It is ok for a test to use sync I/O
        f.write(EXAMPLE_TXT_INPUT)
    mtime_before_open = input_file.stat().st_mtime

    # Make sure that filesystem has enough time to detect changes
    await asyncio.sleep(1)

    kvs = await KeyValueStore.open(
        storage_client=ApifyFileSystemStorageClient(),
        configuration=input_test_configuration,
    )
    stored_value = await kvs.get_value(input_key)
    assert stored_value == EXAMPLE_TXT_INPUT
    assert mtime_before_open == input_file.stat().st_mtime, 'File was modified or recreated.'
153+
154+
155+
@pytest.mark.parametrize('suffix', ['.json', ''])
async def test_json_input_missing_metadata(input_test_configuration: Configuration, suffix: str) -> None:
    """A JSON input file without a metadata file is readable as parsed JSON and is left untouched on disk."""
    input_key = input_test_configuration.input_key
    kvs_path = Path(input_test_configuration.storage_dir) / 'key_value_stores' / 'default'

    # Only the record file is written; no metadata file accompanies it.
    input_file = kvs_path / f'{input_key}{suffix}'
    with open(input_file, 'w', encoding='utf-8') as f:  # noqa: ASYNC230 # It is ok for a test to use sync I/O
        f.write(EXAMPLE_JSON_INPUT)
    mtime_before_open = input_file.stat().st_mtime

    # Make sure that filesystem has enough time to detect changes
    await asyncio.sleep(1)

    kvs = await KeyValueStore.open(
        storage_client=ApifyFileSystemStorageClient(),
        configuration=input_test_configuration,
    )
    stored_value = await kvs.get_value(input_key)
    assert json.loads(EXAMPLE_JSON_INPUT) == stored_value
    assert mtime_before_open == input_file.stat().st_mtime, 'File was modified or recreated.'
174+
175+
176+
@pytest.mark.parametrize('suffix', ['.bin', '', '.whatever'])
async def test_bytes_input_missing_metadata(input_test_configuration: Configuration, suffix: str) -> None:
    """A binary input file without a metadata file is readable as bytes and is left untouched on disk."""
    input_key = input_test_configuration.input_key
    kvs_path = Path(input_test_configuration.storage_dir) / 'key_value_stores' / 'default'

    # Only the record file is written; no metadata file accompanies it.
    input_file = kvs_path / f'{input_key}{suffix}'
    with open(input_file, 'wb') as f:  # noqa: ASYNC230 # It is ok for a test to use sync I/O
        f.write(EXAMPLE_BYTES_INPUT)
    mtime_before_open = input_file.stat().st_mtime

    # Make sure that filesystem has enough time to detect changes
    await asyncio.sleep(1)

    kvs = await KeyValueStore.open(
        storage_client=ApifyFileSystemStorageClient(),
        configuration=input_test_configuration,
    )
    stored_value = await kvs.get_value(input_key)
    assert stored_value == EXAMPLE_BYTES_INPUT
    assert mtime_before_open == input_file.stat().st_mtime, 'File was modified or recreated.'

0 commit comments

Comments
 (0)