Skip to content

Commit ab9e9a4

Browse files
committed
push data type
1 parent 10cb7cb commit ab9e9a4

File tree

10 files changed

+12
-16
lines changed

10 files changed

+12
-16
lines changed

src/crawlee/_types.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -188,7 +188,7 @@ class PushDataKwargs(TypedDict):
188188

189189

190190
class PushDataFunctionCall(PushDataKwargs):
191-
data: JsonSerializable
191+
data: list[dict[str, Any]] | dict[str, Any]
192192
dataset_id: str | None
193193
dataset_name: str | None
194194

@@ -269,7 +269,7 @@ async def add_requests(
269269

270270
async def push_data(
271271
self,
272-
data: JsonSerializable,
272+
data: list[dict[str, Any]] | dict[str, Any],
273273
dataset_id: str | None = None,
274274
dataset_name: str | None = None,
275275
**kwargs: Unpack[PushDataKwargs],
@@ -514,7 +514,7 @@ class PushDataFunction(Protocol):
514514

515515
def __call__(
516516
self,
517-
data: JsonSerializable,
517+
data: list[dict[str, Any]] | dict[str, Any],
518518
dataset_id: str | None = None,
519519
dataset_name: str | None = None,
520520
**kwargs: Unpack[PushDataKwargs],

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -758,7 +758,7 @@ async def export_data(
758758

759759
async def _push_data(
760760
self,
761-
data: JsonSerializable,
761+
data: list[dict[str, Any]] | dict[str, Any],
762762
dataset_id: str | None = None,
763763
dataset_name: str | None = None,
764764
**kwargs: Unpack[PushDataKwargs],

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -216,7 +216,7 @@ async def purge(self) -> None:
216216
)
217217

218218
@override
219-
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
219+
async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
220220
new_item_count = self.metadata.item_count
221221

222222
# If data is a list, push each item individually.

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ async def open(
171171
metadata_path = rq_path / METADATA_FILENAME
172172

173173
# If the RQ directory exists, reconstruct the client from the metadata file.
174-
if rq_path.exists() and not configuration.purge_on_start:
174+
if rq_path.exists():
175175
# If metadata file is missing, raise an error.
176176
if not metadata_path.exists():
177177
raise ValueError(f'Metadata file not found for request queue "{name}"')
@@ -204,10 +204,6 @@ async def open(
204204

205205
# Otherwise, create a new dataset client.
206206
else:
207-
# If purge_on_start is true and the directory exists, remove it
208-
if configuration.purge_on_start and rq_path.exists():
209-
await asyncio.to_thread(shutil.rmtree, rq_path)
210-
211207
now = datetime.now(timezone.utc)
212208
client = cls(
213209
id=crypto_random_object_id(),

src/crawlee/storage_clients/_memory/_dataset_client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ async def purge(self) -> None:
109109
)
110110

111111
@override
112-
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
112+
async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
113113
new_item_count = self.metadata.item_count
114114

115115
if isinstance(data, list):

src/crawlee/storage_clients/_memory/_request_queue_client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
from datetime import datetime, timezone
44
from logging import getLogger
5-
from typing import TYPE_CHECKING, ClassVar
5+
from typing import TYPE_CHECKING
66

77
from typing_extensions import override
88

src/crawlee/storages/_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -158,7 +158,7 @@ async def drop(self) -> None:
158158
async def purge(self) -> None:
159159
await self._client.purge()
160160

161-
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
161+
async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
162162
"""Store an object or an array of objects to the dataset.
163163
164164
The size of the data is limited by the receiving API and therefore `push_data()` will only

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -589,7 +589,7 @@ async def test_context_push_and_get_data_handler_error() -> None:
589589

590590
@crawler.router.default_handler
591591
async def handler(context: BasicCrawlingContext) -> None:
592-
await context.push_data('{"b": 2}')
592+
await context.push_data({'b': 2})
593593
raise RuntimeError('Watch me crash')
594594

595595
stats = await crawler.run(['https://a.com'])

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -291,7 +291,6 @@ async def request_handler(context: HttpCrawlingContext) -> None:
291291
assert responses[0]['data'] == '', 'Response raw data should be empty when only form data is sent.'
292292

293293

294-
# @pytest.mark.skip(reason='TODO: broken, freezing')
295294
async def test_sending_payload_as_json(http_client: HttpClient, server_url: URL) -> None:
296295
crawler = HttpCrawler(http_client=http_client)
297296
responses = []

tests/unit/storages/test_dataset.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
if TYPE_CHECKING:
1515
from collections.abc import AsyncGenerator
1616
from pathlib import Path
17+
from typing import Any
1718

1819
from crawlee.storage_clients import StorageClient
1920

@@ -328,7 +329,7 @@ async def test_list_items(dataset: Dataset) -> None:
328329
async def test_list_items_with_options(dataset: Dataset) -> None:
329330
"""Test that list_items respects filtering options."""
330331
# Add some items
331-
items = [
332+
items: list[dict[str, Any]] = [
332333
{'id': 1, 'name': 'Item 1'},
333334
{'id': 2, 'name': 'Item 2'},
334335
{'id': 3}, # Item with missing 'name' field

0 commit comments

Comments
 (0)