Commit 80a4b99

feat: Redshift UNLOAD - add parallel parameter (#2507)
Signed-off-by: Anton Kukushkin <[email protected]>
1 parent: f26e869

File tree: 3 files changed, +23 −1 lines changed

awswrangler/redshift/_read.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -224,6 +224,7 @@ def unload_to_files(
     aws_session_token: Optional[str] = None,
     region: Optional[str] = None,
     unload_format: Optional[Literal["CSV", "PARQUET"]] = None,
+    parallel: bool = True,
     max_file_size: Optional[float] = None,
     kms_key_id: Optional[str] = None,
     manifest: bool = False,
@@ -265,6 +266,11 @@ def unload_to_files(
     unload_format: str, optional
         Format of the unloaded S3 objects from the query.
         Valid values: "CSV", "PARQUET". Case sensitive. Defaults to PARQUET.
+    parallel: bool
+        Whether to unload to multiple files in parallel. Defaults to True.
+        By default, UNLOAD writes data in parallel to multiple files, according to the number of
+        slices in the cluster. If parallel is False, UNLOAD writes to one or more data files serially,
+        sorted absolutely according to the ORDER BY clause, if one is used.
     max_file_size : float, optional
         Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3.
         Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default
@@ -305,6 +311,7 @@ def unload_to_files(
     partition_str: str = f"\nPARTITION BY ({','.join(partition_cols)})" if partition_cols else ""
     manifest_str: str = "\nmanifest" if manifest is True else ""
     region_str: str = f"\nREGION AS '{region}'" if region is not None else ""
+    parallel_str: str = "\nPARALLEL ON" if parallel else "\nPARALLEL OFF"
     if not max_file_size and engine.get() == EngineEnum.RAY:
         _logger.warning(
             "Unload `MAXFILESIZE` is not specified. "
@@ -330,7 +337,7 @@ def unload_to_files(
         f"TO '{path}'\n"
         f"{auth_str}"
         "ALLOWOVERWRITE\n"
-        "PARALLEL ON\n"
+        f"{parallel_str}\n"
         f"FORMAT {format_str}\n"
         "ENCRYPTED"
         f"{kms_key_id_str}"
@@ -362,6 +369,7 @@ def unload(
     dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
     chunked: Union[bool, int] = False,
     keep_files: bool = False,
+    parallel: bool = True,
     use_threads: Union[bool, int] = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
@@ -433,6 +441,11 @@ def unload(
         used to encrypt data files on Amazon S3.
     keep_files : bool
         Should keep stage files?
+    parallel: bool
+        Whether to unload to multiple files in parallel. Defaults to True.
+        By default, UNLOAD writes data in parallel to multiple files, according to the number of
+        slices in the cluster. If parallel is False, UNLOAD writes to one or more data files serially,
+        sorted absolutely according to the ORDER BY clause, if one is used.
     dtype_backend: str, optional
         Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
         nullable dtypes are used for all dtypes that have a nullable implementation when
@@ -487,6 +500,7 @@ def unload(
         max_file_size=max_file_size,
         kms_key_id=kms_key_id,
         manifest=False,
+        parallel=parallel,
         boto3_session=boto3_session,
     )
     if chunked is False:
```
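
For reference, a minimal usage sketch of the new flag (the cluster credentials, S3 path, and IAM role below are placeholders, not part of the commit):

```python
import awswrangler as wr
import redshift_connector

# Hypothetical cluster connection for illustration only.
con = redshift_connector.connect(
    host="my-cluster.xxxxxxxx.eu-west-1.redshift.amazonaws.com",
    database="dev",
    user="awsuser",
    password="my-password",
)

# parallel=False renders PARALLEL OFF in the generated UNLOAD statement, so
# Redshift writes the result serially (absolutely sorted when the query has
# an ORDER BY) instead of one file per cluster slice.
wr.redshift.unload_to_files(
    sql="SELECT * FROM public.my_table ORDER BY id",
    path="s3://my-bucket/unload/",
    con=con,
    iam_role="arn:aws:iam::111111111111:role/MyRedshiftRole",
    unload_format="PARQUET",
    parallel=False,
)
```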

awswrangler/redshift/_read.pyi

Lines changed: 5 additions & 0 deletions
```diff
@@ -100,6 +100,7 @@ def unload_to_files(
     aws_session_token: Optional[str] = ...,
     region: Optional[str] = ...,
     unload_format: Optional[Literal["CSV", "PARQUET"]] = ...,
+    parallel: bool = ...,
     max_file_size: Optional[float] = ...,
     kms_key_id: Optional[str] = ...,
     manifest: bool = ...,
@@ -121,6 +122,7 @@ def unload(
     dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
     chunked: Literal[False] = ...,
     keep_files: bool = ...,
+    parallel: bool = ...,
     use_threads: Union[bool, int] = ...,
     boto3_session: Optional[boto3.Session] = ...,
     s3_additional_kwargs: Optional[Dict[str, str]] = ...,
@@ -142,6 +144,7 @@ def unload(
     dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
     chunked: Literal[True],
     keep_files: bool = ...,
+    parallel: bool = ...,
     use_threads: Union[bool, int] = ...,
     boto3_session: Optional[boto3.Session] = ...,
     s3_additional_kwargs: Optional[Dict[str, str]] = ...,
@@ -163,6 +166,7 @@ def unload(
     dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
     chunked: bool,
     keep_files: bool = ...,
+    parallel: bool = ...,
     use_threads: Union[bool, int] = ...,
     boto3_session: Optional[boto3.Session] = ...,
     s3_additional_kwargs: Optional[Dict[str, str]] = ...,
@@ -184,6 +188,7 @@ def unload(
     dtype_backend: Literal["numpy_nullable", "pyarrow"] = ...,
     chunked: Union[bool, int],
     keep_files: bool = ...,
+    parallel: bool = ...,
     use_threads: Union[bool, int] = ...,
     boto3_session: Optional[boto3.Session] = ...,
     s3_additional_kwargs: Optional[Dict[str, str]] = ...,
```
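
These stub overloads are how type-checkers pick `unload`'s return type from the `chunked` argument, and the new `parallel` keyword is threaded through each of them. A sketch of the distinction, reusing the hypothetical `con`, bucket, and role from the example above:

```python
# chunked=False (the default) resolves to a single pandas DataFrame.
df = wr.redshift.unload(
    sql="SELECT * FROM public.my_table",
    con=con,
    path="s3://my-bucket/unload/",
    iam_role="arn:aws:iam::111111111111:role/MyRedshiftRole",
    parallel=False,
)

# chunked=True resolves to an iterator of DataFrames.
for chunk in wr.redshift.unload(
    sql="SELECT * FROM public.my_table",
    con=con,
    path="s3://my-bucket/unload/",
    iam_role="arn:aws:iam::111111111111:role/MyRedshiftRole",
    chunked=True,
):
    print(len(chunk))
```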

tests/unit/test_redshift.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -649,13 +649,15 @@ def test_spectrum_decimal_cast(
     [None, {"ServerSideEncryption": "AES256"}, {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": None}],
 )
 @pytest.mark.parametrize("use_threads", [True, False])
+@pytest.mark.parametrize("parallel", [True, False])
 def test_copy_unload_kms(
     path: str,
     redshift_table: str,
     redshift_con: redshift_connector.Connection,
     databases_parameters: Dict[str, Any],
     kms_key_id: str,
     use_threads: bool,
+    parallel: bool,
     s3_additional_kwargs: Optional[Dict[str, Any]],
 ) -> None:
     df = pd.DataFrame({"id": [1, 2, 3]})
@@ -678,6 +680,7 @@ def test_copy_unload_kms(
         iam_role=databases_parameters["redshift"]["role"],
         path=path,
         keep_files=False,
+        parallel=parallel,
         use_threads=use_threads,
         s3_additional_kwargs=s3_additional_kwargs,
     )
```
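
A side note on the test change: stacked `parametrize` markers form a cross-product, so adding the `parallel` marker doubles the number of generated cases for this test. A toy illustration of that mechanism (not the real test):

```python
import pytest

# Two stacked markers yield 2 x 2 = 4 test cases: every combination of
# use_threads and parallel is exercised.
@pytest.mark.parametrize("use_threads", [True, False])
@pytest.mark.parametrize("parallel", [True, False])
def test_matrix(use_threads: bool, parallel: bool) -> None:
    assert isinstance(use_threads, bool)
    assert isinstance(parallel, bool)
```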
