Commit fecae5c

Update of dataset and its clients

1 parent 87772f5

30 files changed: +1052 -2540 lines

pyproject.toml (0 additions & 1 deletion)

@@ -142,7 +142,6 @@ ignore = [
     "PLR0911", # Too many return statements
     "PLR0913", # Too many arguments in function definition
     "PLR0915", # Too many statements
-    "PTH", # flake8-use-pathlib
     "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
     "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
     "S102", # Use of `exec` detected

src/crawlee/_service_locator.py (2 additions & 6 deletions)

@@ -77,13 +77,9 @@ def set_event_manager(self, event_manager: EventManager) -> None:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
-            from crawlee.storage_clients import MemoryStorageClient
+            from crawlee.storage_clients import file_system_storage_client

-            self._storage_client = (
-                MemoryStorageClient.from_config(config=self._configuration)
-                if self._configuration
-                else MemoryStorageClient.from_config()
-            )
+            self._storage_client = file_system_storage_client

         self._storage_client_was_retrieved = True
         return self._storage_client

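The change above replaces the lazily built MemoryStorageClient with the module-level file_system_storage_client as the default. A minimal usage sketch, assuming the service_locator singleton is importable from the crawlee package as in current releases:

    from crawlee import service_locator  # assumed public import path

    # With no explicit storage client configured, the locator now falls back to
    # the file-system-backed client instead of an in-memory one.
    storage_client = service_locator.get_storage_client()
    print(type(storage_client).__name__)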
src/crawlee/_types.py (0 additions & 4 deletions)

@@ -274,10 +274,6 @@ async def push_data(
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
-        from crawlee.storages._dataset import Dataset
-
-        await Dataset.check_and_serialize(data)
-
         self.push_data_calls.append(
             PushDataFunctionCall(
                 data=data,

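The removed call validated that the tracked data was JSON serializable. For context, a roughly equivalent standalone check (a hypothetical helper, not the crawlee implementation) could look like this:

    import json

    def ensure_json_serializable(data: object) -> None:
        # Raise early if the payload cannot be represented as JSON.
        try:
            json.dumps(data)
        except TypeError as exc:
            raise ValueError('Data must be JSON serializable.') from exc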
src/crawlee/fingerprint_suite/_browserforge_adapter.py (3 additions & 3 deletions)

@@ -1,10 +1,10 @@
 from __future__ import annotations

-import os.path
 from collections.abc import Iterable
 from copy import deepcopy
 from functools import reduce
 from operator import or_
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal

 from browserforge.bayesian_network import extract_json

@@ -253,9 +253,9 @@ def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str,

 def get_available_header_network() -> dict:
     """Get header network that contains possible header values."""
-    if os.path.isfile(DATA_DIR / 'header-network.zip'):
+    if Path(DATA_DIR / 'header-network.zip').is_file():
         return extract_json(DATA_DIR / 'header-network.zip')
-    if os.path.isfile(DATA_DIR / 'header-network-definition.zip'):
+    if Path(DATA_DIR / 'header-network-definition.zip').is_file():
         return extract_json(DATA_DIR / 'header-network-definition.zip')
     raise FileNotFoundError('Missing header-network file.')

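This is the pathlib migration that the pyproject.toml change enables by no longer ignoring the PTH (flake8-use-pathlib) rule. A minimal sketch of the same pattern, with DATA_DIR standing in for the package data directory:

    from pathlib import Path

    DATA_DIR = Path(__file__).parent / 'data'  # placeholder for the real data directory

    def has_header_network() -> bool:
        # Path.is_file() replaces os.path.isfile() on the same path object.
        return (DATA_DIR / 'header-network.zip').is_file()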
src/crawlee/storage_clients/__init__.py (7 additions & 2 deletions)

@@ -1,4 +1,9 @@
 from ._base import StorageClient
-from ._memory import MemoryStorageClient
+from ._file_system import file_system_storage_client
+from ._memory import memory_storage_client

-__all__ = ['MemoryStorageClient', 'StorageClient']
+__all__ = [
+    'StorageClient',
+    'file_system_storage_client',
+    'memory_storage_client'
+]

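Under the reworked exports, the package exposes pre-built client instances rather than the MemoryStorageClient class. A hypothetical import against the new names:

    # Assumes the new module-level instances introduced by this commit.
    from crawlee.storage_clients import StorageClient, file_system_storage_client, memory_storage_client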
src/crawlee/storage_clients/_base/__init__.py (0 additions & 5 deletions)

@@ -2,15 +2,10 @@
 from ._key_value_store_client import KeyValueStoreClient
 from ._request_queue_client import RequestQueueClient
 from ._storage_client import StorageClient
-from ._types import ResourceClient

 __all__ = [
     'DatasetClient',
-    'DatasetCollectionClient',
     'KeyValueStoreClient',
-    'KeyValueStoreCollectionClient',
     'RequestQueueClient',
-    'RequestQueueCollectionClient',
-    'ResourceClient',
     'StorageClient',
 ]

src/crawlee/storage_clients/_base/_dataset_client.py (66 additions & 148 deletions)

@@ -7,12 +7,11 @@

 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
-    from contextlib import AbstractAsyncContextManager
+    from datetime import datetime
+    from pathlib import Path
+    from typing import Any

-    from httpx import Response
-
-    from crawlee._types import JsonSerializable
-    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+    from crawlee.storage_clients.models import DatasetItemsListPage


 @docs_group('Abstract classes')
@@ -23,35 +22,78 @@ class DatasetClient(ABC):
     client, like a memory storage client.
     """

-    _LIST_ITEMS_LIMIT = 999_999_999_999
-    """This is what API returns in the x-apify-pagination-limit header when no limit query parameter is used."""
+    @property
+    @abstractmethod
+    def id(self) -> str:
+        """The ID of the dataset."""
+
+    @property
+    @abstractmethod
+    def name(self) -> str | None:
+        """The name of the dataset."""
+
+    @property
+    @abstractmethod
+    def created_at(self) -> datetime:
+        """The time at which the dataset was created."""
+
+    @property
+    @abstractmethod
+    def accessed_at(self) -> datetime:
+        """The time at which the dataset was last accessed."""

+    @property
     @abstractmethod
-    async def get(self) -> DatasetMetadata | None:
-        """Get metadata about the dataset being managed by this client.
+    def modified_at(self) -> datetime:
+        """The time at which the dataset was last modified."""
+
+    @property
+    @abstractmethod
+    def item_count(self) -> int:
+        """The number of items in the dataset."""
+
+    @classmethod
+    @abstractmethod
+    async def open(
+        cls,
+        id: str | None,
+        name: str | None,
+        storage_dir: Path,
+    ) -> DatasetClient:
+        """Open existing or create a new dataset client.
+
+        If a dataset with the given name already exists, the appropriate dataset client is returned.
+        Otherwise, a new dataset is created and client for it is returned.
+
+        Args:
+            id: The ID of the dataset.
+            name: The name of the dataset.
+            storage_dir: The path to the storage directory. If the client persists data, it should use this directory.

         Returns:
-            An object containing the dataset's details, or None if the dataset does not exist.
+            A dataset client.
         """

     @abstractmethod
-    async def delete(self) -> None:
-        """Permanently delete the dataset managed by this client."""
+    async def drop(self) -> None:
+        """Drop the whole dataset and remove all its items.
+
+        The backend method for the `Dataset.drop` call.
+        """

     @abstractmethod
-    async def push_items(self, items: JsonSerializable) -> None:
-        """Push items to the dataset.
+    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+        """Push data to the dataset.

-        Args:
-            items: The items which to push in the dataset. They must be JSON serializable.
+        The backend method for the `Dataset.push_data` call.
         """

     @abstractmethod
-    async def list_items(
+    async def get_data(
         self,
         *,
-        offset: int | None = 0,
-        limit: int | None = _LIST_ITEMS_LIMIT,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
         clean: bool = False,
         desc: bool = False,
         fields: list[str] | None = None,
@@ -62,31 +104,13 @@ async def list_items(
         flatten: list[str] | None = None,
         view: str | None = None,
     ) -> DatasetItemsListPage:
-        """Retrieve a paginated list of items from a dataset based on various filtering parameters.
-
-        This method provides the flexibility to filter, sort, and modify the appearance of dataset items
-        when listed. Each parameter modifies the result set according to its purpose. The method also
-        supports pagination through 'offset' and 'limit' parameters.
-
-        Args:
-            offset: The number of initial items to skip.
-            limit: The maximum number of items to return.
-            clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.
-            desc: If True, items are returned in descending order, i.e., newest first.
-            fields: Specifies a subset of fields to include in each item.
-            omit: Specifies a subset of fields to exclude from each item.
-            unwind: Specifies a field that should be unwound. If it's an array, each element becomes a separate record.
-            skip_empty: If True, omits items that are empty after other filters have been applied.
-            skip_hidden: If True, omits fields starting with the '#' character.
-            flatten: A list of fields to flatten in each item.
-            view: The specific view of the dataset to use when retrieving items.
+        """Get data from the dataset.

-        Returns:
-            An object with filtered, sorted, and paginated dataset items plus pagination details.
+        The backend method for the `Dataset.get_data` call.
         """

     @abstractmethod
-    async def iterate_items(
+    async def iterate(
         self,
         *,
         offset: int = 0,
@@ -99,118 +123,12 @@ async def iterate_items(
         skip_empty: bool = False,
         skip_hidden: bool = False,
     ) -> AsyncIterator[dict]:
-        """Iterate over items in the dataset according to specified filters and sorting.
-
-        This method allows for asynchronously iterating through dataset items while applying various filters such as
-        skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`
-        parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and
-        `skip_hidden` parameters.
+        """Iterate over the dataset.

-        Args:
-            offset: The number of initial items to skip.
-            limit: The maximum number of items to iterate over. None means no limit.
-            clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'.
-            desc: If set to True, items are returned in descending order, i.e., newest first.
-            fields: Specifies a subset of fields to include in each item.
-            omit: Specifies a subset of fields to exclude from each item.
-            unwind: Specifies a field that should be unwound into separate items.
-            skip_empty: If set to True, omits items that are empty after other filters have been applied.
-            skip_hidden: If set to True, omits fields starting with the '#' character from the output.
-
-        Yields:
-            An asynchronous iterator of dictionary objects, each representing a dataset item after applying
-            the specified filters and transformations.
+        The backend method for the `Dataset.iterate` call.
         """
         # This syntax is to make mypy properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
         if False: # type: ignore[unreachable]
             yield 0
-
-    @abstractmethod
-    async def get_items_as_bytes(
-        self,
-        *,
-        item_format: str = 'json',
-        offset: int | None = None,
-        limit: int | None = None,
-        desc: bool = False,
-        clean: bool = False,
-        bom: bool = False,
-        delimiter: str | None = None,
-        fields: list[str] | None = None,
-        omit: list[str] | None = None,
-        unwind: str | None = None,
-        skip_empty: bool = False,
-        skip_header_row: bool = False,
-        skip_hidden: bool = False,
-        xml_root: str | None = None,
-        xml_row: str | None = None,
-        flatten: list[str] | None = None,
-    ) -> bytes:
-        """Retrieve dataset items as bytes.
-
-        Args:
-            item_format: Output format (e.g., 'json', 'csv'); default is 'json'.
-            offset: Number of items to skip; default is 0.
-            limit: Max number of items to return; no default limit.
-            desc: If True, results are returned in descending order.
-            clean: If True, filters out empty items and hidden fields.
-            bom: Include or exclude UTF-8 BOM; default behavior varies by format.
-            delimiter: Delimiter character for CSV; default is ','.
-            fields: List of fields to include in the results.
-            omit: List of fields to omit from the results.
-            unwind: Unwinds a field into separate records.
-            skip_empty: If True, skips empty items in the output.
-            skip_header_row: If True, skips the header row in CSV.
-            skip_hidden: If True, skips hidden fields in the output.
-            xml_root: Root element name for XML output; default is 'items'.
-            xml_row: Element name for each item in XML output; default is 'item'.
-            flatten: List of fields to flatten.
-
-        Returns:
-            The dataset items as raw bytes.
-        """
-
-    @abstractmethod
-    async def stream_items(
-        self,
-        *,
-        item_format: str = 'json',
-        offset: int | None = None,
-        limit: int | None = None,
-        desc: bool = False,
-        clean: bool = False,
-        bom: bool = False,
-        delimiter: str | None = None,
-        fields: list[str] | None = None,
-        omit: list[str] | None = None,
-        unwind: str | None = None,
-        skip_empty: bool = False,
-        skip_header_row: bool = False,
-        skip_hidden: bool = False,
-        xml_root: str | None = None,
-        xml_row: str | None = None,
-    ) -> AbstractAsyncContextManager[Response | None]:
-        """Retrieve dataset items as a streaming response.
-
-        Args:
-            item_format: Output format, options include json, jsonl, csv, html, xlsx, xml, rss; default is json.
-            offset: Number of items to skip at the start; default is 0.
-            limit: Maximum number of items to return; no default limit.
-            desc: If True, reverses the order of results.
-            clean: If True, filters out empty items and hidden fields.
-            bom: Include or exclude UTF-8 BOM; varies by format.
-            delimiter: Delimiter for CSV files; default is ','.
-            fields: List of fields to include in the output.
-            omit: List of fields to omit from the output.
-            unwind: Unwinds a field into separate records.
-            skip_empty: If True, empty items are omitted.
-            skip_header_row: If True, skips the header row in CSV.
-            skip_hidden: If True, hides fields starting with the # character.
-            xml_root: Custom root element name for XML output; default is 'items'.
-            xml_row: Custom element name for each item in XML; default is 'item'.
-
-        Yields:
-            The dataset items in a streaming response.
-        """

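The abstract DatasetClient is now a slimmer interface: metadata properties, an open() factory, and drop/push_data/get_data/iterate. Below is a self-contained sketch of a toy in-memory implementation of part of that surface; the class name and storage details are hypothetical, and it deliberately does not subclass the real abstract base so it runs without crawlee installed.

    from __future__ import annotations

    from collections.abc import AsyncIterator
    from datetime import datetime, timezone
    from pathlib import Path
    from typing import Any


    class ToyDatasetClient:
        """Illustrative, simplified take on the new DatasetClient surface."""

        def __init__(self, id: str, name: str | None) -> None:
            self._id = id
            self._name = name
            self._created_at = datetime.now(timezone.utc)
            self._items: list[dict[str, Any]] = []

        @property
        def id(self) -> str:
            return self._id

        @property
        def name(self) -> str | None:
            return self._name

        @property
        def item_count(self) -> int:
            return len(self._items)

        @classmethod
        async def open(cls, id: str | None, name: str | None, storage_dir: Path) -> ToyDatasetClient:
            # A persistent client would locate or create the dataset under storage_dir;
            # this toy version just builds a fresh in-memory instance.
            return cls(id=id or 'default', name=name)

        async def drop(self) -> None:
            self._items.clear()

        async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
            self._items.extend(data if isinstance(data, list) else [data])

        async def iterate(self, *, offset: int = 0, limit: int | None = None) -> AsyncIterator[dict]:
            end = None if limit is None else offset + limit
            for item in self._items[offset:end]:
                yield item

Usage would be along the lines of client = await ToyDatasetClient.open(id=None, name='default', storage_dir=Path('./storage')), followed by await client.push_data({'url': 'https://example.com'}).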