feat(athena): add start_query_executions for async multi-query execution

ggiallo28 · ggiallo28 · commit 17cd753aa1a6 · 2025-08-28T23:49:04.000+02:00
Introduce `wr.athena.start_query_executions` as a parallelized variant of
`start_query_execution`. It allows submitting multiple queries in one call,
with support for:

- Sequential or threaded submission (`use_threads`)
- Lazy or eager consumption of results (`as_iterator`)
- Per-query `client_request_token` (string or list)
- Optional workgroup checks (`check_workgroup`, `enforce_workgroup`)
- Full Athena cache integration

This improves performance when dispatching batches of queries by reducing
workgroup lookups and enabling concurrent execution.
diff --git a/awswrangler/athena/__init__.py b/awswrangler/athena/__init__.py
@@ -4,6 +4,7 @@
     get_query_execution,
     stop_query_execution,
     start_query_execution,
+    start_query_executions,
     wait_query,
 )
 from awswrangler.athena._spark import create_spark_session, run_spark_calculation
@@ -53,6 +54,7 @@
     "create_ctas_table",
     "show_create_table",
     "start_query_execution",
+    "start_query_executions",
     "stop_query_execution",
     "unload",
     "wait_query",
diff --git a/awswrangler/athena/_executions.py b/awswrangler/athena/_executions.py
@@ -10,12 +10,15 @@
     cast,
 )
 
+import os
 import boto3
 import botocore
 from typing_extensions import Literal
 
+from concurrent.futures import ThreadPoolExecutor
 from awswrangler import _utils, exceptions, typing
 from awswrangler._config import apply_configs
+from functools import reduce
 
 from ._cache import _CacheInfo, _check_for_cached_results
 from ._utils import (
@@ -168,6 +171,195 @@ def start_query_execution(
 
     return query_execution_id
 
+@apply_configs
+def start_query_executions(
+    sqls: list[str],
+    database: str | None = None,
+    s3_output: str | None = None,
+    workgroup: str = "primary",
+    encryption: str | None = None,
+    kms_key: str | None = None,
+    params: dict[str, Any] | list[str] | None = None,
+    paramstyle: Literal["qmark", "named"] = "named",
+    boto3_session: boto3.Session | None = None,
+    client_request_token: str | None = None,
+    athena_cache_settings: typing.AthenaCacheSettings | None = None,
+    athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
+    data_source: str | None = None,
+    wait: bool = False,
+    check_workgroup: bool = True,
+    enforce_workgroup: bool = False,
+    as_iterator: bool = False,
+    use_threads: bool | int = False
+) -> list[str] | list[dict[str, Any]]:
+    """
+    Start multiple SQL queries against Amazon Athena.
+
+    This function is the multi-query variant of ``start_query_execution``.  
+    It supports caching, idempotent request tokens, workgroup configuration, 
+    sequential or parallel execution, and lazy or eager iteration.
+
+    Parameters
+    ----------
+    sqls : list[str]
+        List of SQL queries to execute.
+    database : str, optional
+        AWS Glue/Athena database name.
+    s3_output : str, optional
+        S3 path where query results will be stored.
+    workgroup : str, default 'primary'
+        Athena workgroup name.
+    encryption : str, optional
+        One of {'SSE_S3', 'SSE_KMS', 'CSE_KMS'}.
+    kms_key : str, optional
+        KMS key ARN/ID, required if using KMS-based encryption.
+    params : dict or list, optional
+        Query parameters. Behavior depends on ``paramstyle``.
+    paramstyle : {'named', 'qmark'}, default 'named'
+        Parameter substitution style:
+          - 'named': ``{"name": "value"}`` and query must use ``:name``.
+          - 'qmark': list of values, substituted sequentially.
+    boto3_session : boto3.Session, optional
+        Existing boto3 session. A new session will be created if None.
+    client_request_token : str | list[str], optional
+        Idempotency token(s) for Athena:
+          - If a string: suffixed with an index to generate unique tokens.
+          - If a list: must have same length as ``sqls``.
+          - If None: no token provided (duplicate submissions possible).
+        Tokens are padded/truncated to comply with Athena’s requirement (32–128 chars).
+    athena_cache_settings : dict, optional
+        Wrangler cache settings to reuse results when possible.
+    athena_query_wait_polling_delay : float, default 1.0
+        Interval in seconds between query status checks when waiting.
+    data_source : str, optional
+        Data catalog name (default 'AwsDataCatalog').
+    wait : bool, default False
+        If True, block until queries complete and return their execution details.
+        If False, return query IDs immediately.
+    check_workgroup : bool, default True
+        If True, call GetWorkGroup once to retrieve workgroup configuration.  
+        If False, build a workgroup config from provided parameters (faster, fewer API calls).
+    enforce_workgroup : bool, default False
+        If True, mark the dummy workgroup config as "enforced" when skipping GetWorkGroup.
+    as_iterator : bool, default False
+        If True, return a lazy iterator instead of a list.
+    use_threads : bool | int, default False
+        Controls parallelism:
+          - False: submit queries sequentially.
+          - True: use ``os.cpu_count()`` worker threads.
+          - int: number of worker threads to use.
+
+    Returns
+    -------
+    list[str] | list[dict[str, Any]] | Iterator
+        - If ``wait=False``: list or iterator of query execution IDs.
+        - If ``wait=True``: list or iterator of query execution metadata dicts.
+
+    Examples
+    --------
+    Sequential, no wait:
+    >>> qids = wr.athena.start_query_executions(
+    ...     sqls=["SELECT 1", "SELECT 2"],
+    ...     database="default",
+    ...     s3_output="s3://my-bucket/results/",
+    ... )
+    >>> print(list(qids))
+    ['abc-123...', 'def-456...']
+
+    Parallel execution with 8 threads:
+    >>> qids = wr.athena.start_query_executions(
+    ...     sqls=["SELECT 1", "SELECT 2", "SELECT 3"],
+    ...     database="default",
+    ...     s3_output="s3://my-bucket/results/",
+    ...     use_threads=8,
+    ... )
+
+    Waiting for completion and retrieving metadata:
+    >>> results = wr.athena.start_query_executions(
+    ...     sqls=["SELECT 1"],
+    ...     database="default",
+    ...     s3_output="s3://my-bucket/results/",
+    ...     wait=True
+    ... )
+    >>> print(results[0]["Status"]["State"])
+    'SUCCEEDED'
+    """
+
+    session = boto3_session or boto3.Session()
+    client = session.client("athena")
+
+    if isinstance(client_request_token, list):
+        if len(client_request_token) != len(sqls):
+            raise ValueError("Length of client_request_token list must match number of queries in sqls")
+        tokens = client_request_token
+    elif isinstance(client_request_token, str):
+        tokens = [f"{client_request_token}-{i}".ljust(32, "x")[:128] for i in range(len(sqls))]
+    else:
+        tokens = [None] * len(sqls)
+
+    formatted_queries = list(map(lambda q: _apply_formatter(q, params, paramstyle), sqls))
+
+    if check_workgroup:
+        wg_config: _WorkGroupConfig = _utils._get_workgroup_config(session=session, workgroup=workgroup)
+    else:
+        wg_config = _WorkGroupConfig(
+            enforced=enforce_workgroup,
+            s3_output=s3_output,
+            encryption=encryption,
+            kms_key=kms_key,
+        )
+
+    def _submit(item):
+        (q, execution_params), token = item
+
+        if token is None and athena_cache_settings is not None:
+            cache_info = _executions._check_for_cached_results(
+                sql=q,
+                boto3_session=session,
+                workgroup=workgroup,
+                athena_cache_settings=athena_cache_settings,
+            )
+            if cache_info.has_valid_cache and cache_info.query_execution_id is not None:
+                return cache_info.query_execution_id
+
+        return _start_query_execution(
+            sql=q,
+            wg_config=wg_config,
+            database=database,
+            data_source=data_source,
+            s3_output=s3_output,
+            workgroup=workgroup,
+            encryption=encryption,
+            kms_key=kms_key,
+            execution_params=execution_params,
+            client_request_token=token,
+            boto3_session=session,
+        )
+
+    items = list(zip(formatted_queries, tokens))
+
+    if use_threads is False:
+        query_ids = map(_submit, items)
+    else:
+        max_workers = (
+            os.cpu_count() or 4 if use_threads is True else int(use_threads)
+        )
+        executor = ThreadPoolExecutor(max_workers=max_workers)
+        query_ids = executor.map(_submit, items)
+
+    if wait:
+        results_iter = map(
+            lambda qid: wait_query(
+                query_execution_id=qid,
+                boto3_session=session,
+                athena_query_wait_polling_delay=athena_query_wait_polling_delay,
+            ),
+            query_ids,
+        )
+        return results_iter if as_iterator else list(results_iter)
+
+    return query_ids if as_iterator else list(query_ids)
+
 
 def stop_query_execution(query_execution_id: str, boto3_session: boto3.Session | None = None) -> None:
     """Stop a query execution.