Skip to content

Commit dd94567

Browse files
committed
fix: Also load input from a file with a .json extension in file system storage
1 parent 78ba977 commit dd94567

File tree

2 files changed

+34
-3
lines changed

2 files changed

+34
-3
lines changed

src/apify/_configuration.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from datetime import datetime, timedelta
44
from decimal import Decimal
55
from logging import getLogger
6+
from pathlib import Path
67
from typing import Annotated, Any
78

89
from pydantic import AliasChoices, BeforeValidator, Field, model_validator
@@ -421,6 +422,14 @@ def disable_browser_sandbox_on_platform(self) -> Self:
421422
logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.')
422423
return self
423424

425+
@property
426+
def canonical_input_key(self) -> str:
427+
return Path(self.input_key).stem
428+
429+
@property
430+
def input_key_candidates(self) -> set[str]:
431+
return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).with_suffix('.json').name}
432+
424433
@classmethod
425434
def get_global_configuration(cls) -> Configuration:
426435
"""Retrieve the global instance of the configuration.

src/apify/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
import json
33
from pathlib import Path
44

5+
from more_itertools import flatten
56
from typing_extensions import override
67

78
from crawlee._consts import METADATA_FILENAME
89
from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
10+
from crawlee.storage_clients.models import KeyValueStoreRecord
911

1012
from apify._configuration import Configuration
1113

@@ -24,16 +26,24 @@ async def purge(self) -> None:
2426
It deletes all files in the key-value store directory, except for the metadata file and
2527
the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
2628
"""
27-
kvs_input_key = Configuration.get_global_configuration().input_key
29+
configuration = Configuration.get_global_configuration()
2830

2931
# First try to find the alternative format of the input file and process it if it exists.
3032
for file_path in self.path_to_kvs.glob('*'):
31-
if file_path.name == f'{kvs_input_key}.json':
33+
if (
34+
file_path.name in configuration.input_key_candidates
35+
and file_path.name != configuration.canonical_input_key
36+
):
3237
await self._process_input_json(file_path)
3338

3439
async with self._lock:
40+
files_to_keep = set(
41+
flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates)
42+
)
43+
files_to_keep.add(METADATA_FILENAME)
44+
3545
for file_path in self.path_to_kvs.glob('*'):
36-
if file_path.name in {METADATA_FILENAME, kvs_input_key, f'{kvs_input_key}.{METADATA_FILENAME}'}:
46+
if file_path.name in files_to_keep:
3747
continue
3848
if file_path.is_file():
3949
await asyncio.to_thread(file_path.unlink, missing_ok=True)
@@ -55,3 +65,15 @@ async def _process_input_json(self, path: Path) -> None:
5565
f.close()
5666
await asyncio.to_thread(path.unlink, missing_ok=True)
5767
await self.set_value(key=path.stem, value=input_data)
68+
69+
@override
70+
async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
71+
configuration = Configuration.get_global_configuration()
72+
73+
if key in configuration.input_key_candidates:
74+
for candidate in configuration.input_key_candidates:
75+
value = await super().get_value(key=candidate)
76+
if value is not None:
77+
return value
78+
79+
return await super().get_value(key=key)

0 commit comments

Comments
 (0)