7 | 7 | from typing_extensions import override
8 | 8 |
9 | 9 | from apify_client import ApifyClientAsync
| 10 | +from crawlee._utils.byte_size import ByteSize
| 11 | +from crawlee._utils.file import json_dumps
10 | 12 | from crawlee.storage_clients._base import DatasetClient
11 | 13 | from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
12 | 14 |
15 | 17 |     from datetime import datetime
16 | 18 |
17 | 19 |     from apify_client.clients import DatasetClientAsync
| 20 | +    from crawlee._types import JsonSerializable
18 | 21 |     from crawlee.configuration import Configuration
19 | 22 |
20 | 23 | logger = getLogger(__name__)
23 | 26 | class ApifyDatasetClient(DatasetClient):
24 | 27 |     """An Apify platform implementation of the dataset client."""
25 | 28 |
| 29 | +    _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
| 30 | +    """Maximum size for a single payload."""
| 31 | +
| 32 | +    _SAFETY_BUFFER_PERCENT = 0.01 / 100  # 0.01%
| 33 | +    """Percentage buffer to reduce payload limit slightly for safety."""
| 34 | +
| 35 | +    _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_PERCENT)
| 36 | +    """Calculated payload limit considering safety buffer."""
| 37 | +
26 | 38 |     def __init__(
27 | 39 |         self,
28 | 40 |         *,
@@ -135,8 +147,22 @@ async def drop(self) -> None:
135 | 147 |
136 | 148 |     @override
137 | 149 |     async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
| 150 | +        async def payloads_generator() -> AsyncIterator[str]:
| 151 | +            for index, item in enumerate(data):
| 152 | +                yield await self._check_and_serialize(item, index)
| 153 | +
138 | 154 |         async with self._lock:
139 | | -            await self._api_client.push_items(items=data)
| 155 | +            # Handle lists
| 156 | +            if isinstance(data, list):
| 157 | +                # Invoke client in series to preserve the order of data
| 158 | +                async for items in self._chunk_by_size(payloads_generator()):
| 159 | +                    await self._api_client.push_items(items=items)
| 160 | +
| 161 | +            # Handle singular items
| 162 | +            else:
| 163 | +                items = await self._check_and_serialize(data)
| 164 | +                await self._api_client.push_items(items=items)
| 165 | +
140 | 166 |             await self._update_metadata()
141 | 167 |
142 | 168 |     @override
@@ -205,3 +231,60 @@ async def _update_metadata(self) -> None:
205 | 231 |         """Update the dataset metadata file with current information."""
206 | 232 |         metadata = await self._api_client.get()
207 | 233 |         self._metadata = DatasetMetadata.model_validate(metadata)
| 234 | +
| 235 | +    @classmethod
| 236 | +    async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str:
| 237 | +        """Serialize a given item to JSON, checking its serializability and size against a limit.
| 238 | +
| 239 | +        Args:
| 240 | +            item: The item to serialize.
| 241 | +            index: Index of the item, used for error context.
| 242 | +
| 243 | +        Returns:
| 244 | +            Serialized JSON string.
| 245 | +
| 246 | +        Raises:
| 247 | +            ValueError: If item is not JSON serializable or exceeds size limit.
| 248 | +        """
| 249 | +        s = ' ' if index is None else f' at index {index} '
| 250 | +
| 251 | +        try:
| 252 | +            payload = await json_dumps(item)
| 253 | +        except Exception as exc:
| 254 | +            raise ValueError(f'Data item{s}is not serializable to JSON.') from exc
| 255 | +
| 256 | +        payload_size = ByteSize(len(payload.encode('utf-8')))
| 257 | +        if payload_size > cls._EFFECTIVE_LIMIT_SIZE:
| 258 | +            raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})')
| 259 | +
| 260 | +        return payload
| 261 | +
| 262 | +    async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]:
| 263 | +        """Yield chunks of JSON arrays composed of input strings, respecting a size limit.
| 264 | +
| 265 | +        Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size
| 266 | +        of each array does not exceed `_EFFECTIVE_LIMIT_SIZE`. Each output is a JSON array string that
| 267 | +        contains as many payloads as possible without breaching the size threshold, maintaining the
| 268 | +        order of the original payloads. Assumes individual items are below the size limit.
| 269 | +
| 270 | +        Args:
| 271 | +            items: Iterable of JSON string payloads.
| 272 | +
| 273 | +        Yields:
| 274 | +            Strings representing JSON arrays of payloads, each staying within the size limit.
| 275 | +        """
| 276 | +        last_chunk_size = ByteSize(2)  # Add 2 bytes for [] wrapper.
| 277 | +        current_chunk = []
| 278 | +
| 279 | +        async for payload in items:
| 280 | +            payload_size = ByteSize(len(payload.encode('utf-8')))
| 281 | +
| 282 | +            if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE:
| 283 | +                current_chunk.append(payload)
| 284 | +                last_chunk_size += payload_size + ByteSize(1)  # Add 1 byte for ',' separator.
| 285 | +            else:
| 286 | +                yield f'[{",".join(current_chunk)}]'
| 287 | +                current_chunk = [payload]
| 288 | +                last_chunk_size = payload_size + ByteSize(2)  # Add 2 bytes for [] wrapper.
| 289 | +
| 290 | +        yield f'[{",".join(current_chunk)}]'
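To see the chunking behaviour in isolation, here is a small self-contained sketch that mirrors the `_chunk_by_size` logic with plain byte counts and a deliberately tiny limit; the `chunk_by_size` helper and the 48-byte limit are illustrative, not part of the client:

```python
import asyncio
import json
from collections.abc import AsyncIterator


async def chunk_by_size(payloads: AsyncIterator[str], limit: int) -> AsyncIterator[str]:
    """Group JSON payload strings into JSON-array strings of at most `limit` bytes."""
    chunk: list[str] = []
    chunk_size = 2  # Account for the '[]' wrapper.
    async for payload in payloads:
        payload_size = len(payload.encode('utf-8'))
        if chunk_size + payload_size <= limit:
            chunk.append(payload)
            chunk_size += payload_size + 1  # Account for the ',' separator.
        else:
            yield f'[{",".join(chunk)}]'
            chunk = [payload]
            chunk_size = payload_size + 2
    yield f'[{",".join(chunk)}]'


async def main() -> None:
    async def payloads() -> AsyncIterator[str]:
        for i in range(10):
            yield json.dumps({'index': i})

    # Each printed JSON array stays within the 48-byte limit while preserving item order.
    async for array in chunk_by_size(payloads(), limit=48):
        print(array, len(array.encode('utf-8')))


asyncio.run(main())
```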