feat(athena): improve start_query_executions with simplified tokens and parallel wait

ggiallo28 · ggiallo28 · commit b6e4d8815034 · 2025-08-29T00:02:42.000+02:00
- Simplified client_request_token handling:
  - Removed manual padding/truncation.
  - Let Athena enforce length constraints.
  - Tokens are generated as `<base_token>-<index>`, or provided as a list.
- Improved wait logic:
  - Added optional wait handling directly inside _submit.
  - Queries can now be waited on in parallel with submission (reduced overhead).
- Configurable default threads:
  - Replaced hardcoded defaults with a module-level default derived from os.cpu_count() (minimum of 4 workers).
  - Added support for AWSWRANGLER_THREADS_DEFAULT env var override.
diff --git a/awswrangler/athena/_executions.py b/awswrangler/athena/_executions.py
@@ -32,6 +32,7 @@
 
 _logger: logging.Logger = logging.getLogger(__name__)
 
+_DEFAULT_MAX_WORKERS = max(4, os.cpu_count() or 4)
 
 @apply_configs
 def start_query_execution(
@@ -179,25 +180,25 @@ def start_query_executions(
     workgroup: str = "primary",
     encryption: str | None = None,
     kms_key: str | None = None,
-    params: dict[str, Any] | list[str] | None = None,
+    params: dict[str, typing.Any] | list[str] | None = None,
     paramstyle: Literal["qmark", "named"] = "named",
     boto3_session: boto3.Session | None = None,
-    client_request_token: str | None = None,
+    client_request_token: str | list[str] | None = None,
     athena_cache_settings: typing.AthenaCacheSettings | None = None,
-    athena_query_wait_polling_delay: float = _QUERY_WAIT_POLLING_DELAY,
+    athena_query_wait_polling_delay: float = 1.0,
     data_source: str | None = None,
     wait: bool = False,
     check_workgroup: bool = True,
     enforce_workgroup: bool = False,
     as_iterator: bool = False,
-    use_threads: bool | int = False
-) -> list[str] | list[dict[str, Any]]:
+    use_threads: bool | int = False,
+) -> list[str] | list[dict[str, typing.Any]]:
     """
     Start multiple SQL queries against Amazon Athena.
 
-    This function is the multi-query variant of ``start_query_execution``.  
-    It supports caching, idempotent request tokens, workgroup configuration, 
-    sequential or parallel execution, and lazy or eager iteration.
+    Each query can optionally use Athena's result cache and idempotent request tokens.
+    Submissions can be sequential or parallel, and each query can be waited on
+    individually (inside its submission thread) if ``wait=True``.
 
     Parameters
     ----------
@@ -216,91 +217,51 @@ def start_query_executions(
     params : dict or list, optional
         Query parameters. Behavior depends on ``paramstyle``.
     paramstyle : {'named', 'qmark'}, default 'named'
-        Parameter substitution style:
-          - 'named': ``{"name": "value"}`` and query must use ``:name``.
-          - 'qmark': list of values, substituted sequentially.
+        Parameter substitution style.
     boto3_session : boto3.Session, optional
         Existing boto3 session. A new session will be created if None.
     client_request_token : str | list[str], optional
-        Idempotency token(s) for Athena:
-          - If a string: suffixed with an index to generate unique tokens.
-          - If a list: must have same length as ``sqls``.
-          - If None: no token provided (duplicate submissions possible).
-        Tokens are padded/truncated to comply with Athena’s requirement (32–128 chars).
+        Idempotency token(s). If a string, suffixed with query index.
     athena_cache_settings : dict, optional
-        Wrangler cache settings to reuse results when possible.
+        Wrangler cache settings for query result reuse.
     athena_query_wait_polling_delay : float, default 1.0
-        Interval in seconds between query status checks when waiting.
+        Interval between status checks when waiting for queries.
     data_source : str, optional
         Data catalog name (default 'AwsDataCatalog').
     wait : bool, default False
-        If True, block until queries complete and return their execution details.
-        If False, return query IDs immediately.
+        If True, block until each query completes.
     check_workgroup : bool, default True
-        If True, call GetWorkGroup once to retrieve workgroup configuration.  
-        If False, build a workgroup config from provided parameters (faster, fewer API calls).
+        If True, fetch workgroup config from Athena.
     enforce_workgroup : bool, default False
-        If True, mark the dummy workgroup config as "enforced" when skipping GetWorkGroup.
+        If True, enforce workgroup config even when skipping fetch.
     as_iterator : bool, default False
-        If True, return a lazy iterator instead of a list.
+        If True, return an iterator instead of a list.
     use_threads : bool | int, default False
-        Controls parallelism:
-          - False: submit queries sequentially.
-          - True: use ``os.cpu_count()`` worker threads.
-          - int: number of worker threads to use.
+        Parallelism:
+          - False: sequential execution
+          - True: ``os.cpu_count()`` threads
+          - int: number of worker threads
 
     Returns
     -------
-    list[str] | list[dict[str, Any]] | Iterator
-        - If ``wait=False``: list or iterator of query execution IDs.
-        - If ``wait=True``: list or iterator of query execution metadata dicts.
-
-    Examples
-    --------
-    Sequential, no wait:
-    >>> qids = wr.athena.start_query_executions(
-    ...     sqls=["SELECT 1", "SELECT 2"],
-    ...     database="default",
-    ...     s3_output="s3://my-bucket/results/",
-    ... )
-    >>> print(list(qids))
-    ['abc-123...', 'def-456...']
-
-    Parallel execution with 8 threads:
-    >>> qids = wr.athena.start_query_executions(
-    ...     sqls=["SELECT 1", "SELECT 2", "SELECT 3"],
-    ...     database="default",
-    ...     s3_output="s3://my-bucket/results/",
-    ...     use_threads=8,
-    ... )
-
-    Waiting for completion and retrieving metadata:
-    >>> results = wr.athena.start_query_executions(
-    ...     sqls=["SELECT 1"],
-    ...     database="default",
-    ...     s3_output="s3://my-bucket/results/",
-    ...     wait=True
-    ... )
-    >>> print(results[0]["Status"]["State"])
-    'SUCCEEDED'
+    list[str] | list[dict] | Iterator
+        QueryExecutionIds or execution metadata dicts if ``wait=True``.
     """
-
     session = boto3_session or boto3.Session()
-    client = session.client("athena")
 
     if isinstance(client_request_token, list):
         if len(client_request_token) != len(sqls):
             raise ValueError("Length of client_request_token list must match number of queries in sqls")
         tokens = client_request_token
     elif isinstance(client_request_token, str):
-        tokens = [f"{client_request_token}-{i}".ljust(32, "x")[:128] for i in range(len(sqls))]
+        tokens = [f"{client_request_token}-{i}" for i in range(len(sqls))]
     else:
         tokens = [None] * len(sqls)
 
     formatted_queries = list(map(lambda q: _apply_formatter(q, params, paramstyle), sqls))
 
     if check_workgroup:
-        wg_config: _WorkGroupConfig = _utils._get_workgroup_config(session=session, workgroup=workgroup)
+        wg_config: _WorkGroupConfig = _get_workgroup_config(session=session, workgroup=workgroup)
     else:
         wg_config = _WorkGroupConfig(
             enforced=enforce_workgroup,
@@ -309,20 +270,28 @@ def start_query_executions(
             kms_key=kms_key,
         )
 
-    def _submit(item):
+    def _submit(item: tuple[tuple[str, list[str] | None], str | None]):
         (q, execution_params), token = item
 
         if token is None and athena_cache_settings is not None:
-            cache_info = _executions._check_for_cached_results(
+            cache_info = _check_for_cached_results(
                 sql=q,
                 boto3_session=session,
                 workgroup=workgroup,
                 athena_cache_settings=athena_cache_settings,
             )
             if cache_info.has_valid_cache and cache_info.query_execution_id is not None:
-                return cache_info.query_execution_id
-
-        return _start_query_execution(
+                return (
+                    wait_query(
+                        query_execution_id=cache_info.query_execution_id,
+                        boto3_session=session,
+                        athena_query_wait_polling_delay=athena_query_wait_polling_delay,
+                    )
+                    if wait
+                    else cache_info.query_execution_id
+                )
+
+        qid = _start_query_execution(
             sql=q,
             wg_config=wg_config,
             database=database,
@@ -336,29 +305,25 @@ def _submit(item):
             boto3_session=session,
         )
 
+        if wait:
+            return wait_query(
+                query_execution_id=qid,
+                boto3_session=session,
+                athena_query_wait_polling_delay=athena_query_wait_polling_delay,
+            )
+
+        return qid
+
     items = list(zip(formatted_queries, tokens))
 
     if use_threads is False:
-        query_ids = map(_submit, items)
+        results = map(_submit, items)
     else:
-        max_workers = (
-            os.cpu_count() or 4 if use_threads is True else int(use_threads)
-        )
-        executor = ThreadPoolExecutor(max_workers=max_workers)
-        query_ids = executor.map(_submit, items)
-
-    if wait:
-        results_iter = map(
-            lambda qid: wait_query(
-                query_execution_id=qid,
-                boto3_session=session,
-                athena_query_wait_polling_delay=athena_query_wait_polling_delay,
-            ),
-            query_ids,
-        )
-        return results_iter if as_iterator else list(results_iter)
+        max_workers = _DEFAULT_MAX_WORKERS if use_threads is True else int(use_threads)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            results = executor.map(_submit, items)
 
-    return query_ids if as_iterator else list(query_ids)
+    return results if as_iterator else list(results)
 
 
 def stop_query_execution(query_execution_id: str, boto3_session: boto3.Session | None = None) -> None: