Commit 5977f37

feat: Add additional kwargs to Crawler's export_data (#1597)
### Description
- Add additional kwargs to Crawler's `export_data`.
- Update of existing doc examples.

### Issues
- Closes: #526

### Testing
- New test for additional kwargs.

### Checklist
- [x] CI passed
1 parent: 883355a
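
In short, extra keyword arguments now ride along on `export_data` and are forwarded to the underlying exporter. A minimal sketch (assuming crawlee's current `crawlee.crawlers` import path; the kwargs mirror the updated doc examples and tests below):

```python
import asyncio

from crawlee.crawlers import BasicCrawler


async def main() -> None:
    crawler = BasicCrawler()
    # ... register a request handler, run the crawler, push data ...

    # JSON export: extra kwargs are forwarded to json.dump.
    await crawler.export_data(path='results.json', ensure_ascii=False, sort_keys=True)

    # CSV export: extra kwargs are forwarded to csv.writer.
    await crawler.export_data(path='results.csv', delimiter=';', lineterminator='\n')


asyncio.run(main())
```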

File tree

6 files changed: +56 -10 lines

docs/examples/code_examples/export_entire_dataset_to_file_csv.py

Lines changed: 2 additions & 1 deletion

@@ -30,7 +30,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a CSV file.
-    await crawler.export_data(path='results.csv')
+    # Use a semicolon as the delimiter and quote every field (requires `import csv`).
+    await crawler.export_data(path='results.csv', delimiter=';', quoting=csv.QUOTE_ALL)


 if __name__ == '__main__':
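
For orientation, these kwargs land on the standard library's `csv.writer`; a small illustration of what `delimiter` and `quoting` control, independent of crawlee:

```python
import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, delimiter=';', quoting=csv.QUOTE_ALL, lineterminator='\n')
writer.writerow(['id', 'title'])
writer.writerow([1, 'hello; world'])

# QUOTE_ALL quotes every field, so the embedded ';' cannot break the row.
assert buf.getvalue() == '"id";"title"\n"1";"hello; world"\n'
```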

docs/examples/code_examples/export_entire_dataset_to_file_json.py

Lines changed: 2 additions & 1 deletion

@@ -30,7 +30,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a JSON file.
-    await crawler.export_data(path='results.json')
+    # Set ensure_ascii=False to allow Unicode characters in the output.
+    await crawler.export_data(path='results.json', ensure_ascii=False)


 if __name__ == '__main__':
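
The JSON kwargs go straight to the standard library's `json.dump`; for example, `ensure_ascii=False` writes non-ASCII characters verbatim instead of escaping them:

```python
import json

data = [{'title': 'Čeština'}]

# Default behavior: non-ASCII characters are escaped.
assert json.dumps(data) == '[{"title": "\\u010ce\\u0161tina"}]'

# With ensure_ascii=False they are written as-is.
assert json.dumps(data, ensure_ascii=False) == '[{"title": "Čeština"}]'
```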

docs/examples/export_entire_dataset_to_file.mdx

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py';
 import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py';

-This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format.
+This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior.

 :::note

src/crawlee/_utils/file.py

Lines changed: 7 additions & 0 deletions

@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
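
The behavior the comment describes is easy to see in isolation; a quick illustration using only the standard library:

```python
import csv
import io

# With the default lineterminator ('\r\n'), rows already contain '\r\n'.
# If such a row is then written to a text-mode file on Windows, newline
# translation turns each '\n' into '\r\n', producing '\r\r\n'.
buf = io.StringIO()
csv.writer(buf).writerow(['id', 'test'])
assert buf.getvalue() == 'id,test\r\n'

# Forcing '\n' leaves line-ending translation to the platform instead.
buf = io.StringIO()
csv.writer(buf, lineterminator='\n').writerow(['id', 'test'])
assert buf.getvalue() == 'id,test\n'
```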

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 15 additions & 5 deletions

@@ -14,6 +14,7 @@
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
@@ -32,6 +33,8 @@
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -41,7 +44,7 @@
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -868,6 +871,7 @@ async def export_data(
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
@@ -880,6 +884,7 @@ async def export_data(
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -889,13 +894,18 @@
             configuration=self._service_locator.get_configuration(),
         )

-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
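
The export now renders into an in-memory `StringIO` and hands the finished text to `atomic_write`, so an interrupted export no longer leaves a truncated file behind. A conceptual sketch of that write pattern (crawlee's actual `atomic_write` may differ in details; `atomic_write_sketch` is a hypothetical stand-in):

```python
import os
import tempfile
from pathlib import Path


def atomic_write_sketch(path: Path, data: str) -> None:
    """Write `data` to `path` so readers never observe a partial file."""
    # Create the temporary file next to the destination so os.replace
    # stays on the same filesystem and remains atomic.
    fd, tmp_path = tempfile.mkstemp(dir=path.parent)
    try:
        with os.fdopen(fd, 'w', newline='') as tmp_file:
            tmp_file.write(data)
        os.replace(tmp_path, path)  # atomic rename on POSIX and Windows
    except BaseException:
        os.unlink(tmp_path)
        raise
```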

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 29 additions & 2 deletions

@@ -732,7 +732,29 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
         {'id': 1, 'test': 'test'},
         {'id': 2, 'test': 'test'},
     ]
-    assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+
+    # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
+    # On Unix/Linux, \n remains as \n.
+    if sys.platform == 'win32':
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+    else:
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'
+
+
+async def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> None:
+    crawler = BasicCrawler()
+    dataset = await Dataset.open()
+
+    await dataset.push_data({'z': 1, 'a': 2})
+
+    json_path = tmp_path / 'dataset.json'
+    csv_path = tmp_path / 'dataset.csv'
+
+    await crawler.export_data(path=json_path, sort_keys=True, separators=(',', ':'))
+    await crawler.export_data(path=csv_path, delimiter=';', lineterminator='\n')
+
+    assert json_path.read_text() == '[{"a":2,"z":1}]'
+    assert csv_path.read_text() == 'z;a\n1;2\n'


 async def test_context_push_and_export_data(tmp_path: Path) -> None:
@@ -754,7 +776,12 @@ async def handler(context: BasicCrawlingContext) -> None:
         {'id': 2, 'test': 'test'},
     ]

-    assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+    # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
+    # On Unix/Linux, \n remains as \n.
+    if sys.platform == 'win32':
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+    else:
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'


 async def test_context_update_kv_store() -> None:
