Skip to content

Commit 579d428

Browse files
authored
Merge branch 'main' into feature/iceberg-merge-into-full-functionality
2 parents 5c65f85 + c81581c commit 579d428

File tree

15 files changed

+203
-19
lines changed

15 files changed

+203
-19
lines changed

.github/workflows/cfn-nag.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
steps:
2727
- uses: actions/checkout@v5
2828
- name: Use Node.js
29-
uses: actions/setup-node@v5
29+
uses: actions/setup-node@v6
3030
with:
3131
node-version: 18
3232
- name: Cache Node.js modules
@@ -49,7 +49,7 @@ jobs:
4949
- name: Rust latest
5050
run: rustup update
5151
- name: Install uv
52-
uses: astral-sh/setup-uv@v6
52+
uses: astral-sh/setup-uv@v7
5353
with:
5454
enable-cache: true
5555
- name: Set up cdk.json

.github/workflows/minimal-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
- name: Rust latest
3838
run: rustup update
3939
- name: Install uv
40-
uses: astral-sh/setup-uv@v6
40+
uses: astral-sh/setup-uv@v7
4141
with:
4242
enable-cache: true
4343
- name: Install Requirements

.github/workflows/snyk.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,6 @@ jobs:
2323
with:
2424
args: --severity-threshold=high --sarif-file-output=snyk.sarif
2525
- name: Upload result to GitHub Code Scanning
26-
uses: github/codeql-action/upload-sarif@v3
26+
uses: github/codeql-action/upload-sarif@v4
2727
with:
2828
sarif_file: snyk.sarif

.github/workflows/static-checking.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
- name: Rust latest
3030
run: rustup update
3131
- name: Install uv
32-
uses: astral-sh/setup-uv@v6
32+
uses: astral-sh/setup-uv@v7
3333
with:
3434
enable-cache: true
3535
- name: Install Requirements

awswrangler/athena/_executions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def start_query_execution(
4040
kms_key: str | None = None,
4141
params: dict[str, Any] | list[str] | None = None,
4242
paramstyle: Literal["qmark", "named"] = "named",
43+
result_reuse_configuration: dict[str, Any] | None = None,
4344
boto3_session: boto3.Session | None = None,
4445
client_request_token: str | None = None,
4546
athena_cache_settings: typing.AthenaCacheSettings | None = None,
@@ -87,6 +88,9 @@ def start_query_execution(
8788
8889
- ``named``
8990
- ``qmark``
91+
result_reuse_configuration
92+
A structure that contains the configuration settings for reusing query results.
93+
See also: https://docs.aws.amazon.com/athena/latest/ug/reusing-query-results.html
9094
boto3_session
9195
The default boto3 session will be used if **boto3_session** receive ``None``.
9296
client_request_token
@@ -156,6 +160,7 @@ def start_query_execution(
156160
encryption=encryption,
157161
kms_key=kms_key,
158162
execution_params=execution_params,
163+
result_reuse_configuration=result_reuse_configuration,
159164
client_request_token=client_request_token,
160165
boto3_session=boto3_session,
161166
)

awswrangler/athena/_executions.pyi

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def start_query_execution(
1818
kms_key: str | None = ...,
1919
params: dict[str, Any] | list[str] | None = ...,
2020
paramstyle: Literal["qmark", "named"] = ...,
21+
result_reuse_configuration: dict[str, Any] | None = ...,
2122
boto3_session: boto3.Session | None = ...,
2223
athena_cache_settings: typing.AthenaCacheSettings | None = ...,
2324
athena_query_wait_polling_delay: float = ...,
@@ -35,6 +36,7 @@ def start_query_execution(
3536
kms_key: str | None = ...,
3637
params: dict[str, Any] | list[str] | None = ...,
3738
paramstyle: Literal["qmark", "named"] = ...,
39+
result_reuse_configuration: dict[str, Any] | None = ...,
3840
boto3_session: boto3.Session | None = ...,
3941
athena_cache_settings: typing.AthenaCacheSettings | None = ...,
4042
athena_query_wait_polling_delay: float = ...,
@@ -52,6 +54,7 @@ def start_query_execution(
5254
kms_key: str | None = ...,
5355
params: dict[str, Any] | list[str] | None = ...,
5456
paramstyle: Literal["qmark", "named"] = ...,
57+
result_reuse_configuration: dict[str, Any] | None = ...,
5558
boto3_session: boto3.Session | None = ...,
5659
athena_cache_settings: typing.AthenaCacheSettings | None = ...,
5760
athena_query_wait_polling_delay: float = ...,

awswrangler/athena/_read.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ def _resolve_query_without_cache_regular(
427427
s3_additional_kwargs: dict[str, Any] | None,
428428
boto3_session: boto3.Session | None,
429429
execution_params: list[str] | None = None,
430+
result_reuse_configuration: dict[str, Any] | None = None,
430431
dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
431432
client_request_token: str | None = None,
432433
) -> pd.DataFrame | Iterator[pd.DataFrame]:
@@ -444,6 +445,7 @@ def _resolve_query_without_cache_regular(
444445
encryption=encryption,
445446
kms_key=kms_key,
446447
execution_params=execution_params,
448+
result_reuse_configuration=result_reuse_configuration,
447449
client_request_token=client_request_token,
448450
boto3_session=boto3_session,
449451
)
@@ -467,7 +469,7 @@ def _resolve_query_without_cache_regular(
467469
)
468470

469471

470-
def _resolve_query_without_cache(
472+
def _resolve_query_without_cache( # noqa: PLR0913
471473
sql: str,
472474
database: str,
473475
data_source: str | None,
@@ -491,6 +493,7 @@ def _resolve_query_without_cache(
491493
boto3_session: boto3.Session | None,
492494
pyarrow_additional_kwargs: dict[str, Any] | None = None,
493495
execution_params: list[str] | None = None,
496+
result_reuse_configuration: dict[str, Any] | None = None,
494497
dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
495498
client_request_token: str | None = None,
496499
) -> pd.DataFrame | Iterator[pd.DataFrame]:
@@ -572,6 +575,7 @@ def _resolve_query_without_cache(
572575
s3_additional_kwargs=s3_additional_kwargs,
573576
boto3_session=boto3_session,
574577
execution_params=execution_params,
578+
result_reuse_configuration=result_reuse_configuration,
575579
dtype_backend=dtype_backend,
576580
client_request_token=client_request_token,
577581
)
@@ -785,6 +789,7 @@ def read_sql_query(
785789
athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
786790
params: dict[str, Any] | list[str] | None = None,
787791
paramstyle: Literal["qmark", "named"] = "named",
792+
result_reuse_configuration: dict[str, Any] | None = None,
788793
dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
789794
s3_additional_kwargs: dict[str, Any] | None = None,
790795
pyarrow_additional_kwargs: dict[str, Any] | None = None,
@@ -980,6 +985,10 @@ def read_sql_query(
980985
981986
- ``named``
982987
- ``qmark``
988+
result_reuse_configuration
989+
A structure that contains the configuration settings for reusing query results.
990+
This parameter is only valid when both `ctas_approach` and `unload_approach` are set to `False`.
991+
See also: https://docs.aws.amazon.com/athena/latest/ug/reusing-query-results.html
983992
dtype_backend
984993
Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
985994
nullable dtypes are used for all dtypes that have a nullable implementation when
@@ -1040,6 +1049,10 @@ def read_sql_query(
10401049
raise exceptions.InvalidArgumentCombination(
10411050
"Using `client_request_token` is only allowed when `ctas_approach=False` and `unload_approach=False`."
10421051
)
1052+
if result_reuse_configuration and (ctas_approach or unload_approach):
1053+
raise exceptions.InvalidArgumentCombination(
1054+
"Using `result_reuse_configuration` is only allowed when `ctas_approach=False` and `unload_approach=False`."
1055+
)
10431056
chunksize = sys.maxsize if ctas_approach is False and chunksize is True else chunksize
10441057

10451058
# Substitute query parameters if applicable
@@ -1104,6 +1117,7 @@ def read_sql_query(
11041117
boto3_session=boto3_session,
11051118
pyarrow_additional_kwargs=pyarrow_additional_kwargs,
11061119
execution_params=execution_params,
1120+
result_reuse_configuration=result_reuse_configuration,
11071121
dtype_backend=dtype_backend,
11081122
client_request_token=client_request_token,
11091123
)

awswrangler/athena/_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ def _start_query_execution(
9393
encryption: str | None = None,
9494
kms_key: str | None = None,
9595
execution_params: list[str] | None = None,
96+
result_reuse_configuration: dict[str, Any] | None = None,
9697
client_request_token: str | None = None,
9798
boto3_session: boto3.Session | None = None,
9899
) -> str:
@@ -130,6 +131,9 @@ def _start_query_execution(
130131
if execution_params:
131132
args["ExecutionParameters"] = execution_params
132133

134+
if result_reuse_configuration:
135+
args["ResultReuseConfiguration"] = result_reuse_configuration
136+
133137
client_athena = _utils.client(service_name="athena", session=boto3_session)
134138
_logger.debug("Starting query execution with args: \n%s", pprint.pformat(args))
135139
response = _utils.try_it(

awswrangler/mysql.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def connect(
9393
write_timeout: int | None = None,
9494
connect_timeout: int = 10,
9595
cursorclass: type["Cursor"] | None = None,
96-
) -> "pymysql.connections.Connection": # type: ignore[type-arg]
96+
) -> "pymysql.connections.Connection":
9797
"""Return a pymysql connection from a Glue Catalog Connection or Secrets Manager.
9898
9999
https://pymysql.readthedocs.io
@@ -231,7 +231,7 @@ def read_sql_query(
231231
@_utils.check_optional_dependency(pymysql, "pymysql")
232232
def read_sql_query(
233233
sql: str,
234-
con: "pymysql.connections.Connection", # type: ignore[type-arg]
234+
con: "pymysql.connections.Connection",
235235
index_col: str | list[str] | None = None,
236236
params: list[Any] | tuple[Any, ...] | dict[Any, Any] | None = None,
237237
chunksize: int | None = None,
@@ -351,7 +351,7 @@ def read_sql_table(
351351
@_utils.check_optional_dependency(pymysql, "pymysql")
352352
def read_sql_table(
353353
table: str,
354-
con: "pymysql.connections.Connection", # type: ignore[type-arg]
354+
con: "pymysql.connections.Connection",
355355
schema: str | None = None,
356356
index_col: str | list[str] | None = None,
357357
params: list[Any] | tuple[Any, ...] | dict[Any, Any] | None = None,
@@ -439,7 +439,7 @@ def read_sql_table(
439439
@apply_configs
440440
def to_sql(
441441
df: pd.DataFrame,
442-
con: "pymysql.connections.Connection", # type: ignore[type-arg]
442+
con: "pymysql.connections.Connection",
443443
table: str,
444444
schema: str,
445445
mode: _ToSqlModeLiteral = "append",

awswrangler/redshift/_read.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ def unload_to_files(
241241
kms_key_id: str | None = None,
242242
manifest: bool = False,
243243
partition_cols: list[str] | None = None,
244+
cleanpath: bool = False,
244245
boto3_session: boto3.Session | None = None,
245246
) -> None:
246247
"""Unload Parquet files on s3 from a Redshift query result (Through the UNLOAD command).
@@ -294,6 +295,21 @@ def unload_to_files(
294295
Unload a manifest file on S3.
295296
partition_cols
296297
Specifies the partition keys for the unload operation.
298+
cleanpath
299+
Use CLEANPATH instead of ALLOWOVERWRITE. When True, uses CLEANPATH to remove existing files
300+
located in the Amazon S3 path before unloading files. When False (default), uses ALLOWOVERWRITE
301+
to overwrite existing files, including the manifest file. These options are mutually exclusive.
302+
303+
ALLOWOVERWRITE: By default, UNLOAD fails if it finds files that it would possibly overwrite.
304+
If ALLOWOVERWRITE is specified, UNLOAD overwrites existing files, including the manifest file.
305+
306+
CLEANPATH: Removes existing files located in the Amazon S3 path specified in the TO clause
307+
before unloading files to the specified location. If you include the PARTITION BY clause,
308+
existing files are removed only from the partition folders to receive new files generated
309+
by the UNLOAD operation. You must have the s3:DeleteObject permission on the Amazon S3 bucket.
310+
Files removed using CLEANPATH are permanently deleted and can't be recovered.
311+
312+
For more information, see: https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html
297313
boto3_session
298314
The default boto3 session will be used if **boto3_session** is ``None``.
299315
@@ -307,6 +323,15 @@ def unload_to_files(
307323
... con=con,
308324
... iam_role="arn:aws:iam::XXX:role/XXX"
309325
... )
326+
>>> # Using CLEANPATH instead of ALLOWOVERWRITE
327+
>>> with wr.redshift.connect("MY_GLUE_CONNECTION") as con:
328+
... wr.redshift.unload_to_files(
329+
... sql="SELECT * FROM public.mytable",
330+
... path="s3://bucket/extracted_parquet_files/",
331+
... con=con,
332+
... iam_role="arn:aws:iam::XXX:role/XXX",
333+
... cleanpath=True
334+
... )
310335
311336
312337
"""
@@ -339,11 +364,13 @@ def unload_to_files(
339364
# Escape quotation marks in SQL
340365
sql = sql.replace("'", "''")
341366

367+
overwrite_str: str = "CLEANPATH" if cleanpath else "ALLOWOVERWRITE"
368+
342369
unload_sql = (
343370
f"UNLOAD ('{sql}')\n"
344371
f"TO '{path}'\n"
345372
f"{auth_str}"
346-
"ALLOWOVERWRITE\n"
373+
f"{overwrite_str}\n"
347374
f"{parallel_str}\n"
348375
f"FORMAT {format_str}\n"
349376
"ENCRYPTED"
@@ -376,6 +403,7 @@ def unload(
376403
chunked: bool | int = False,
377404
keep_files: bool = False,
378405
parallel: bool = True,
406+
cleanpath: bool = False,
379407
use_threads: bool | int = True,
380408
boto3_session: boto3.Session | None = None,
381409
s3_additional_kwargs: dict[str, str] | None = None,
@@ -452,6 +480,21 @@ def unload(
452480
By default, UNLOAD writes data in parallel to multiple files, according to the number of
453481
slices in the cluster. If parallel is False, UNLOAD writes to one or more data files serially,
454482
sorted absolutely according to the ORDER BY clause, if one is used.
483+
cleanpath
484+
Use CLEANPATH instead of ALLOWOVERWRITE. When True, uses CLEANPATH to remove existing files
485+
located in the Amazon S3 path before unloading files. When False (default), uses ALLOWOVERWRITE
486+
to overwrite existing files, including the manifest file. These options are mutually exclusive.
487+
488+
ALLOWOVERWRITE: By default, UNLOAD fails if it finds files that it would possibly overwrite.
489+
If ALLOWOVERWRITE is specified, UNLOAD overwrites existing files, including the manifest file.
490+
491+
CLEANPATH: Removes existing files located in the Amazon S3 path specified in the TO clause
492+
before unloading files to the specified location. If you include the PARTITION BY clause,
493+
existing files are removed only from the partition folders to receive new files generated
494+
by the UNLOAD operation. You must have the s3:DeleteObject permission on the Amazon S3 bucket.
495+
Files removed using CLEANPATH are permanently deleted and can't be recovered.
496+
497+
For more information, see: https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html
455498
dtype_backend
456499
Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
457500
nullable dtypes are used for all dtypes that have a nullable implementation when
@@ -489,6 +532,15 @@ def unload(
489532
... con=con,
490533
... iam_role="arn:aws:iam::XXX:role/XXX"
491534
... )
535+
>>> # Using CLEANPATH instead of ALLOWOVERWRITE
536+
>>> with wr.redshift.connect("MY_GLUE_CONNECTION") as con:
537+
... df = wr.redshift.unload(
538+
... sql="SELECT * FROM public.mytable",
539+
... path="s3://bucket/extracted_parquet_files/",
540+
... con=con,
541+
... iam_role="arn:aws:iam::XXX:role/XXX",
542+
... cleanpath=True
543+
... )
492544
493545
"""
494546
path = path if path.endswith("/") else f"{path}/"
@@ -505,6 +557,7 @@ def unload(
505557
kms_key_id=kms_key_id,
506558
manifest=False,
507559
parallel=parallel,
560+
cleanpath=cleanpath,
508561
boto3_session=boto3_session,
509562
)
510563
if chunked is False:

0 commit comments

Comments (0)