Skip to content

Commit b00876e

Browse files
authored
fix: fix handling of loading empty metadata file for queue (#1042)
### Description
- Add error handling for the case where an empty metadata file was created.

### Issues
- Closes: #1029
1 parent a4f85ef commit b00876e

File tree

3 files changed

+50
-18
lines changed

3 files changed

+50
-18
lines changed

src/crawlee/storage_clients/_memory/_creation_management.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -409,9 +409,16 @@ def _determine_storage_path(
409409
metadata_path = os.path.join(entry.path, METADATA_FILENAME)
410410
if os.access(metadata_path, os.F_OK):
411411
with open(metadata_path, encoding='utf-8') as metadata_file:
412-
metadata = json.load(metadata_file)
413-
if (id and metadata.get('id') == id) or (name and metadata.get('name') == name):
414-
return entry.path
412+
try:
413+
metadata = json.load(metadata_file)
414+
if (id and metadata.get('id') == id) or (name and metadata.get('name') == name):
415+
return entry.path
416+
except Exception:
417+
logger.warning(
418+
f'Metadata of store entry "{entry.name}" for store {name or id} could not be parsed. '
419+
'The metadata file will be ignored.',
420+
exc_info=True,
421+
)
415422

416423
# Check for default storage directory as a last resort
417424
if id == default_id:

src/crawlee/storage_clients/_memory/_key_value_store_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,13 +334,13 @@ async def persist_record(self, record: KeyValueStoreRecord) -> None:
334334
await asyncio.to_thread(f.close)
335335

336336
if self._memory_storage_client.write_metadata:
337-
f = await asyncio.to_thread(open, record_metadata_path, mode='wb')
337+
metadata_f = await asyncio.to_thread(open, record_metadata_path, mode='wb')
338338

339339
try:
340340
record_metadata = KeyValueStoreRecordMetadata(key=record.key, content_type=record.content_type)
341-
await asyncio.to_thread(f.write, record_metadata.model_dump_json(indent=2).encode('utf-8'))
341+
await asyncio.to_thread(metadata_f.write, record_metadata.model_dump_json(indent=2).encode('utf-8'))
342342
finally:
343-
await asyncio.to_thread(f.close)
343+
await asyncio.to_thread(metadata_f.close)
344344

345345
async def delete_persisted_record(self, record: KeyValueStoreRecord) -> None:
346346
"""Delete the specified record from the key-value store."""
Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
from __future__ import annotations
22

33
import json
4-
import os
5-
from typing import TYPE_CHECKING
4+
from pathlib import Path
5+
from unittest.mock import AsyncMock, patch
6+
7+
import pytest
68

79
from crawlee._consts import METADATA_FILENAME
810
from crawlee.storage_clients._memory._creation_management import persist_metadata_if_enabled
911

10-
if TYPE_CHECKING:
11-
from pathlib import Path
12-
1312

1413
async def test_persist_metadata_skips_when_disabled(tmp_path: Path) -> None:
1514
await persist_metadata_if_enabled(data={'key': 'value'}, entity_directory=str(tmp_path), write_metadata=False)
@@ -18,17 +17,43 @@ async def test_persist_metadata_skips_when_disabled(tmp_path: Path) -> None:
1817

1918
async def test_persist_metadata_creates_files_and_directories_when_enabled(tmp_path: Path) -> None:
2019
data = {'key': 'value'}
21-
entity_directory = os.path.join(tmp_path, 'new_dir')
22-
await persist_metadata_if_enabled(data=data, entity_directory=entity_directory, write_metadata=True)
23-
assert os.path.exists(entity_directory) # Check if directory was created
24-
assert os.path.isfile(os.path.join(entity_directory, METADATA_FILENAME)) # Check if file was created
20+
entity_directory = Path(tmp_path, 'new_dir')
21+
await persist_metadata_if_enabled(data=data, entity_directory=str(entity_directory), write_metadata=True)
22+
assert entity_directory.exists() is True # Check if directory was created
23+
assert (entity_directory / METADATA_FILENAME).is_file() # Check if file was created
2524

2625

2726
async def test_persist_metadata_correctly_writes_data(tmp_path: Path) -> None:
2827
data = {'key': 'value'}
29-
entity_directory = os.path.join(tmp_path, 'data_dir')
30-
await persist_metadata_if_enabled(data=data, entity_directory=entity_directory, write_metadata=True)
31-
metadata_path = os.path.join(entity_directory, METADATA_FILENAME)
28+
entity_directory = Path(tmp_path, 'data_dir')
29+
await persist_metadata_if_enabled(data=data, entity_directory=str(entity_directory), write_metadata=True)
30+
metadata_path = entity_directory / METADATA_FILENAME
3231
with open(metadata_path) as f: # noqa: ASYNC230
3332
content = f.read()
3433
assert json.loads(content) == data # Check if correct data was written
34+
35+
36+
async def test_persist_metadata_rewrites_data_with_error(tmp_path: Path) -> None:
37+
init_data = {'key': 'very_long_value'}
38+
update_data = {'key': 'short_value'}
39+
error_data = {'key': 'error'}
40+
41+
entity_directory = Path(tmp_path, 'data_dir')
42+
metadata_path = entity_directory / METADATA_FILENAME
43+
44+
# write metadata with init_data
45+
await persist_metadata_if_enabled(data=init_data, entity_directory=str(entity_directory), write_metadata=True)
46+
47+
# rewrite metadata with update_data
48+
await persist_metadata_if_enabled(data=update_data, entity_directory=str(entity_directory), write_metadata=True)
49+
with open(metadata_path) as f: # noqa: ASYNC230
50+
content = f.read()
51+
assert json.loads(content) == update_data # Check if correct data was rewritten
52+
53+
# raise interrupt between opening a file and writing
54+
module_for_patch = 'crawlee.storage_clients._memory._creation_management.json_dumps'
55+
with patch(module_for_patch, AsyncMock(side_effect=KeyboardInterrupt())), pytest.raises(KeyboardInterrupt):
56+
await persist_metadata_if_enabled(data=error_data, entity_directory=str(entity_directory), write_metadata=True)
57+
with open(metadata_path) as f: # noqa: ASYNC230
58+
content = f.read()
59+
assert content == '' # The file is empty after an error

0 commit comments

Comments
 (0)