Skip to content

Commit cd9adf1

Browse files
authored
fix: double incrementation of item_count (#443)
### Description

- Fixes `item_count` being unexpectedly incremented when a dataset is loaded from its metadata file.

### Issues

- Closes: #442

### Testing

- Added the `test_reuse_dataset` test.

### Checklist

- [x] CI passed
1 parent 4367cc2 commit cd9adf1

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

src/crawlee/memory_storage_client/_creation_management.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ def create_dataset_from_directory(
181181
from crawlee.memory_storage_client._dataset_client import DatasetClient
182182

183183
item_count = 0
184+
has_seen_metadata_file = False
184185
created_at = datetime.now(timezone.utc)
185186
accessed_at = datetime.now(timezone.utc)
186187
modified_at = datetime.now(timezone.utc)
@@ -189,6 +190,7 @@ def create_dataset_from_directory(
189190
metadata_filepath = os.path.join(storage_directory, METADATA_FILENAME)
190191

191192
if os.path.exists(metadata_filepath):
193+
has_seen_metadata_file = True
192194
with open(metadata_filepath, encoding='utf-8') as f:
193195
json_content = json.load(f)
194196
resource_info = DatasetMetadata(**json_content)
@@ -202,7 +204,6 @@ def create_dataset_from_directory(
202204

203205
# Load dataset entries
204206
entries: dict[str, dict] = {}
205-
has_seen_metadata_file = False
206207

207208
for entry in os.scandir(storage_directory):
208209
if entry.is_file():

tests/unit/_memory_storage_client/test_dataset_client.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,13 @@ async def test_iterate_items(dataset_client: DatasetClient) -> None:
138138
assert len(actual_items) == item_count
139139
assert actual_items[0]['id'] == 0
140140
assert actual_items[99]['id'] == 99
141+
142+
143+
async def test_reuse_dataset(dataset_client: DatasetClient, memory_storage_client: MemoryStorageClient) -> None:
144+
item_count = 10
145+
await dataset_client.push_items([{'id': i} for i in range(item_count)])
146+
147+
memory_storage_client.datasets_handled = [] # purge datasets loaded to test create_dataset_from_directory
148+
datasets_client = memory_storage_client.datasets()
149+
dataset_info = await datasets_client.get_or_create(name='test')
150+
assert dataset_info.item_count == item_count

0 commit comments

Comments
 (0)