Commit 9aaee11

Cleanup
1 parent d0ee2fa commit 9aaee11

20 files changed, +343 -790 lines changed

src/crawlee/_consts.py
Lines changed: 1 addition & 0 deletions

@@ -1,3 +1,4 @@
 from __future__ import annotations
 
 METADATA_FILENAME = '__metadata__.json'
+"""The name of the metadata file for storage clients."""

src/crawlee/_types.py
Lines changed: 150 additions & 76 deletions

@@ -3,27 +3,41 @@
 import dataclasses
 from collections.abc import Iterator, Mapping
 from dataclasses import dataclass
-from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload
+from typing import (
+    TYPE_CHECKING,
+    Annotated,
+    Any,
+    Callable,
+    Literal,
+    Optional,
+    Protocol,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
-from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack
 
 from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
+    import json
     import logging
     import re
-    from collections.abc import Coroutine, Sequence
+    from collections.abc import Callable, Coroutine, Sequence
+
+    from typing_extensions import NotRequired, Required, TypeAlias, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
+    from crawlee.configuration import Configuration
     from crawlee.http_clients import HttpResponse
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
-    from crawlee.storage_clients.models import DatasetItemsListPage
+    from crawlee.storage_clients import StorageClient
     from crawlee.storages import KeyValueStore
-    from crawlee.storages._types import ExportToKwargs, GetDataKwargs
 
 # Workaround for https://github.com/pydantic/pydantic/issues/9445
 J = TypeVar('J', bound='JsonSerializable')
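The hunk above moves the typing_extensions names under the TYPE_CHECKING guard, so they are only imported while type checking rather than at runtime. A minimal, self-contained sketch of that general pattern (independent of this commit; the function name first_or_none is hypothetical):

from __future__ import annotations  # annotations stay unevaluated at runtime

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only by static type checkers; never executed when the program runs.
    from collections.abc import Sequence


def first_or_none(items: Sequence[int]) -> int | None:
    # The Sequence annotation is just a string at runtime, so the guarded
    # import above is never needed outside of type checking.
    return items[0] if items else None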
@@ -138,15 +152,6 @@ def __init__(
         self.max_tasks_per_minute = max_tasks_per_minute
 
 
-@docs_group('Data structures')
-class StorageTypes(str, Enum):
-    """Possible Crawlee storage types."""
-
-    DATASET = 'Dataset'
-    KEY_VALUE_STORE = 'Key-value store'
-    REQUEST_QUEUE = 'Request queue'
-
-
 class EnqueueLinksKwargs(TypedDict):
     """Keyword arguments for the `enqueue_links` methods."""
 
@@ -416,55 +421,6 @@ def __call__(
         """
 
 
-@docs_group('Functions')
-class ExportToFunction(Protocol):
-    """A function for exporting data from a `Dataset`.
-
-    It simplifies the process of exporting data from a `Dataset`. It opens the specified one and exports
-    its content to a `KeyValueStore`.
-    """
-
-    def __call__(
-        self,
-        dataset_id: str | None = None,
-        dataset_name: str | None = None,
-        **kwargs: Unpack[ExportToKwargs],
-    ) -> Coroutine[None, None, None]:
-        """Call dunder method.
-
-        Args:
-            dataset_id: The ID of the `Dataset` to export data from.
-            dataset_name: The name of the `Dataset` to export data from.
-            **kwargs: Additional keyword arguments.
-        """
-
-
-@docs_group('Functions')
-class GetDataFunction(Protocol):
-    """A function for retrieving data from a `Dataset`.
-
-    It simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves
-    data based on the provided parameters. It allows filtering and pagination.
-    """
-
-    def __call__(
-        self,
-        dataset_id: str | None = None,
-        dataset_name: str | None = None,
-        **kwargs: Unpack[GetDataKwargs],
-    ) -> Coroutine[None, None, DatasetItemsListPage]:
-        """Call dunder method.
-
-        Args:
-            dataset_id: ID of the `Dataset` to get data from.
-            dataset_name: Name of the `Dataset` to get data from.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            A page of retrieved items.
-        """
-
-
 @docs_group('Functions')
 class GetKeyValueStoreFunction(Protocol):
     """A function for accessing a `KeyValueStore`.
@@ -575,18 +531,6 @@ def __bool__(self) -> bool:
         return bool(self.screenshot or self.html)
 
 
-@docs_group('Functions')
-class GetPageSnapshot(Protocol):
-    """A function for getting snapshot of a page."""
-
-    def __call__(self) -> Coroutine[None, None, PageSnapshot]:
-        """Get page snapshot.
-
-        Returns:
-            Snapshot of a page.
-        """
-
-
 @docs_group('Functions')
 class UseStateFunction(Protocol):
     """A function for managing state within the crawling context.
@@ -654,3 +598,133 @@ async def get_snapshot(self) -> PageSnapshot:
     def __hash__(self) -> int:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
+
+
+class GetDataKwargs(TypedDict):
+    """Keyword arguments for dataset's `get_data` method."""
+
+    offset: NotRequired[int]
+    """Skips the specified number of items at the start."""
+
+    limit: NotRequired[int | None]
+    """The maximum number of items to retrieve. Unlimited if None."""
+
+    clean: NotRequired[bool]
+    """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty."""
+
+    desc: NotRequired[bool]
+    """Set to True to sort results in descending order."""
+
+    fields: NotRequired[list[str]]
+    """Fields to include in each item. Sorts fields as specified if provided."""
+
+    omit: NotRequired[list[str]]
+    """Fields to exclude from each item."""
+
+    unwind: NotRequired[str]
+    """Unwinds items by a specified array field, turning each element into a separate item."""
+
+    skip_empty: NotRequired[bool]
+    """Excludes empty items from the results if True."""
+
+    skip_hidden: NotRequired[bool]
+    """Excludes fields starting with '#' if True."""
+
+    flatten: NotRequired[list[str]]
+    """Fields to be flattened in returned items."""
+
+    view: NotRequired[str]
+    """Specifies the dataset view to be used."""
+
+
+class ExportToKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_to` method."""
+
+    key: Required[str]
+    """The key under which to save the data."""
+
+    content_type: NotRequired[Literal['json', 'csv']]
+    """The format in which to export the data. Either 'json' or 'csv'."""
+
+    to_kvs_id: NotRequired[str]
+    """ID of the key-value store to save the exported file."""
+
+    to_kvs_name: NotRequired[str]
+    """Name of the key-value store to save the exported file."""
+
+    to_kvs_storage_client: NotRequired[StorageClient]
+    """The storage client to use for saving the exported file."""
+
+    to_kvs_configuration: NotRequired[Configuration]
+    """The configuration to use for saving the exported file."""
+
+
+class ExportDataJsonKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_json` method."""
+
+    skipkeys: NotRequired[bool]
+    """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped
+    instead of raising a `TypeError`."""
+
+    ensure_ascii: NotRequired[bool]
+    """Determines if non-ASCII characters should be escaped in the output JSON string."""
+
+    check_circular: NotRequired[bool]
+    """If False (default: True), skips the circular reference check for container types. A circular reference will
+    result in a `RecursionError` or worse if unchecked."""
+
+    allow_nan: NotRequired[bool]
+    """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply
+    with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)."""
+
+    cls: NotRequired[type[json.JSONEncoder]]
+    """Allows specifying a custom JSON encoder."""
+
+    indent: NotRequired[int]
+    """Specifies the number of spaces for indentation in the pretty-printed JSON output."""
+
+    separators: NotRequired[tuple[str, str]]
+    """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')
+    otherwise."""
+
+    default: NotRequired[Callable]
+    """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version
+    of the object or raise a `TypeError`."""
+
+    sort_keys: NotRequired[bool]
+    """Specifies whether the output JSON object should have keys sorted alphabetically."""
+
+
+class ExportDataCsvKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_csv` method."""
+
+    dialect: NotRequired[str]
+    """Specifies a dialect to be used in CSV parsing and writing."""
+
+    delimiter: NotRequired[str]
+    """A one-character string used to separate fields. Defaults to ','."""
+
+    doublequote: NotRequired[bool]
+    """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;
+    when False, the `escapechar` is used as a prefix. Defaults to True."""
+
+    escapechar: NotRequired[str]
+    """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`
+    if `doublequote` is False. Defaults to None, disabling escaping."""
+
+    lineterminator: NotRequired[str]
+    """The string used to terminate lines produced by the writer. Defaults to '\\r\\n'."""
+
+    quotechar: NotRequired[str]
+    """A one-character string used to quote fields containing special characters, like the delimiter or quotechar,
+    or fields containing new-line characters. Defaults to '\"'."""
+
+    quoting: NotRequired[int]
+    """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of
+    the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`."""
+
+    skipinitialspace: NotRequired[bool]
+    """When True, spaces immediately following the delimiter are ignored. Defaults to False."""
+
+    strict: NotRequired[bool]
+    """When True, raises an exception on bad CSV input. Defaults to False."""

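The field names in ExportDataJsonKwargs and ExportDataCsvKwargs mirror the parameters of the standard library's json.dump and csv.writer, so kwargs of this shape can be forwarded to those calls directly. The helpers below (dump_items_json, dump_items_csv) are a hypothetical illustration of that idea, not part of crawlee:

import csv
import io
import json
from typing import Any


def dump_items_json(items: list[dict[str, Any]], **kwargs: Any) -> str:
    # kwargs such as indent, sort_keys or ensure_ascii go straight to json.dump.
    buffer = io.StringIO()
    json.dump(items, buffer, **kwargs)
    return buffer.getvalue()


def dump_items_csv(items: list[dict[str, Any]], **kwargs: Any) -> str:
    # kwargs such as delimiter, quotechar or quoting go straight to csv.writer.
    buffer = io.StringIO()
    writer = csv.writer(buffer, **kwargs)
    if items:
        writer.writerow(items[0].keys())  # header row from the first item's keys
        writer.writerows(item.values() for item in items)
    return buffer.getvalue()


print(dump_items_json([{'url': 'https://example.com'}], indent=2, sort_keys=True))
print(dump_items_csv([{'url': 'https://example.com'}], delimiter=';'))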
src/crawlee/_utils/data_processing.py
Lines changed: 0 additions & 41 deletions
This file was deleted.
