Commit 5977f37

feat: Add additional kwargs to Crawler's export_data (#1597)
### Description
- Add additional kwargs to Crawler's `export_data`.
- Update of existing doc examples.

### Issues
- Closes: #526

### Testing
- New test for additional kwargs.

### Checklist
- [x] CI passed
1 parent: 883355a
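
In short, extra keyword arguments now ride along on `export_data` and are forwarded to the underlying exporter. A minimal sketch (assuming crawlee's current `crawlee.crawlers` import path; the kwargs mirror the updated doc examples and tests below):

```python
import asyncio

from crawlee.crawlers import BasicCrawler


async def main() -> None:
    crawler = BasicCrawler()
    # ... register a request handler, run the crawler, push data ...

    # JSON export: extra kwargs are forwarded to json.dump.
    await crawler.export_data(path='results.json', ensure_ascii=False, sort_keys=True)

    # CSV export: extra kwargs are forwarded to csv.writer.
    await crawler.export_data(path='results.csv', delimiter=';', lineterminator='\n')


asyncio.run(main())
```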

File tree

6 files changed: +56 -10 lines

docs/examples/code_examples/export_entire_dataset_to_file_csv.py

Lines changed: 2 additions & 1 deletion

@@ -30,7 +30,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a CSV file.
-    await crawler.export_data(path='results.csv')
+    # Use a semicolon as the delimiter and quote every field (requires `import csv`).
+    await crawler.export_data(path='results.csv', delimiter=';', quoting=csv.QUOTE_ALL)


 if __name__ == '__main__':
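
For orientation, these kwargs land on the standard library's `csv.writer`; a small illustration of what `delimiter` and `quoting` control, independent of crawlee:

```python
import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, delimiter=';', quoting=csv.QUOTE_ALL, lineterminator='\n')
writer.writerow(['id', 'title'])
writer.writerow([1, 'hello; world'])

# QUOTE_ALL quotes every field, so the embedded ';' cannot break the row.
assert buf.getvalue() == '"id";"title"\n"1";"hello; world"\n'
```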

docs/examples/code_examples/export_entire_dataset_to_file_json.py

Lines changed: 2 additions & 1 deletion

@@ -30,7 +30,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a JSON file.
-    await crawler.export_data(path='results.json')
+    # Set ensure_ascii=False to allow Unicode characters in the output.
+    await crawler.export_data(path='results.json', ensure_ascii=False)


 if __name__ == '__main__':
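
The JSON kwargs go straight to the standard library's `json.dump`; for example, `ensure_ascii=False` writes non-ASCII characters verbatim instead of escaping them:

```python
import json

data = [{'title': 'Čeština'}]

# Default behavior: non-ASCII characters are escaped.
assert json.dumps(data) == '[{"title": "\\u010ce\\u0161tina"}]'

# With ensure_ascii=False they are written as-is.
assert json.dumps(data, ensure_ascii=False) == '[{"title": "Čeština"}]'
```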

docs/examples/export_entire_dataset_to_file.mdx

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py';
 import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py';

-This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format.
+This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior.

 :::note

src/crawlee/_utils/file.py

Lines changed: 7 additions & 0 deletions

@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
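
The behavior the comment describes is easy to see in isolation; a quick illustration using only the standard library:

```python
import csv
import io

# With the default lineterminator ('\r\n'), rows already contain '\r\n'.
# If such a row is then written to a text-mode file on Windows, newline
# translation turns each '\n' into '\r\n', producing '\r\r\n'.
buf = io.StringIO()
csv.writer(buf).writerow(['id', 'test'])
assert buf.getvalue() == 'id,test\r\n'

# Forcing '\n' leaves line-ending translation to the platform instead.
buf = io.StringIO()
csv.writer(buf, lineterminator='\n').writerow(['id', 'test'])
assert buf.getvalue() == 'id,test\n'
```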

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 15 additions & 5 deletions

@@ -14,6 +14,7 @@
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
@@ -32,6 +33,8 @@
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -41,7 +44,7 @@
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -868,6 +871,7 @@ async def export_data(
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
@@ -880,6 +884,7 @@ async def export_data(
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -889,13 +894,18 @@
             configuration=self._service_locator.get_configuration(),
         )

-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
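
The export now renders into an in-memory `StringIO` and hands the finished text to `atomic_write`, so an interrupted export no longer leaves a truncated file behind. A conceptual sketch of that write pattern (crawlee's actual `atomic_write` may differ in details; `atomic_write_sketch` is a hypothetical stand-in):

```python
import os
import tempfile
from pathlib import Path


def atomic_write_sketch(path: Path, data: str) -> None:
    """Write `data` to `path` so readers never observe a partial file."""
    # Create the temporary file next to the destination so os.replace
    # stays on the same filesystem and remains atomic.
    fd, tmp_path = tempfile.mkstemp(dir=path.parent)
    try:
        with os.fdopen(fd, 'w', newline='') as tmp_file:
            tmp_file.write(data)
        os.replace(tmp_path, path)  # atomic rename on POSIX and Windows
    except BaseException:
        os.unlink(tmp_path)
        raise
```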

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 29 additions & 2 deletions

@@ -732,7 +732,29 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
         {'id': 1, 'test': 'test'},
         {'id': 2, 'test': 'test'},
     ]
-    assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+
+    # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
+    # On Unix/Linux, \n remains as \n.
+    if sys.platform == 'win32':
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+    else:
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'
+
+
+async def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> None:
+    crawler = BasicCrawler()
+    dataset = await Dataset.open()
+
+    await dataset.push_data({'z': 1, 'a': 2})
+
+    json_path = tmp_path / 'dataset.json'
+    csv_path = tmp_path / 'dataset.csv'
+
+    await crawler.export_data(path=json_path, sort_keys=True, separators=(',', ':'))
+    await crawler.export_data(path=csv_path, delimiter=';', lineterminator='\n')
+
+    assert json_path.read_text() == '[{"a":2,"z":1}]'
+    assert csv_path.read_text() == 'z;a\n1;2\n'


 async def test_context_push_and_export_data(tmp_path: Path) -> None:
@@ -754,7 +776,12 @@ async def handler(context: BasicCrawlingContext) -> None:
         {'id': 2, 'test': 'test'},
     ]

-    assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+    # On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
+    # On Unix/Linux, \n remains as \n.
+    if sys.platform == 'win32':
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
+    else:
+        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'


 async def test_context_update_kv_store() -> None:
