Merge branch 'main' into release-3.0.0

LeonLuttenberger · LeonLuttenberger · commit 5e2f96e6499e · 2022-10-21T15:45:18.000-05:00
diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py
@@ -414,6 +414,12 @@ def table_refs_to_df(tables: List[pa.Table], kwargs: Dict[str, Any]) -> pd.DataF
     return _table_to_df(pa.concat_tables(tables, promote=True), kwargs=kwargs)
 
 
+@engine.dispatch_on_engine
+def is_pandas_frame(obj: Any) -> bool:
+    """Check if the passed object is a Pandas DataFrame."""
+    return isinstance(obj, pd.DataFrame)
+
+
 def list_to_arrow_table(
     mapping: List[Dict[str, Any]],
     schema: Optional[pa.Schema] = None,
diff --git a/awswrangler/athena/__init__.py b/awswrangler/athena/__init__.py
@@ -9,7 +9,9 @@
     get_named_query_statement,
     get_query_columns_types,
     get_query_execution,
+    get_query_executions,
     get_work_group,
+    list_query_executions,
     repair_table,
     show_create_table,
     start_query_execution,
@@ -24,10 +26,12 @@
     "describe_table",
     "get_query_columns_types",
     "get_query_execution",
+    "get_query_executions",
     "get_query_results",
     "get_named_query_statement",
     "get_work_group",
     "generate_create_query",
+    "list_query_executions",
     "repair_table",
     "create_ctas_table",
     "show_create_table",
diff --git a/awswrangler/athena/_read.py b/awswrangler/athena/_read.py
@@ -105,16 +105,23 @@ def _fetch_parquet_result(
     if not paths:
         if not temp_table_fqn:
             raise exceptions.EmptyDataFrame("Query would return untyped, empty dataframe.")
+
         database, temp_table_name = map(lambda x: x.replace('"', ""), temp_table_fqn.split("."))
         dtype_dict = catalog.get_table_types(database=database, table=temp_table_name, boto3_session=boto3_session)
         df = pd.DataFrame(columns=list(dtype_dict.keys()))
         df = cast_pandas_with_athena_types(df=df, dtype=dtype_dict)
         df = _apply_query_metadata(df=df, query_metadata=query_metadata)
+
+        if chunked:
+            return (df,)
+
         return df
+
     if not pyarrow_additional_kwargs:
         pyarrow_additional_kwargs = {}
         if categories:
             pyarrow_additional_kwargs["categories"] = categories
+
     ret = s3.read_parquet(
         path=paths,
         use_threads=use_threads,
diff --git a/awswrangler/athena/_utils.py b/awswrangler/athena/_utils.py
@@ -1146,3 +1146,104 @@ def get_query_execution(query_execution_id: str, boto3_session: Optional[boto3.S
         QueryExecutionId=query_execution_id,
     )
     return cast(Dict[str, Any], response["QueryExecution"])
+
+
+def get_query_executions(
+    query_execution_ids: List[str], return_unprocessed: bool = False, boto3_session: Optional[boto3.Session] = None
+) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
+    """From specified query execution IDs, return a DataFrame of query execution details.
+
+    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/athena.html#Athena.Client.batch_get_query_execution
+
+    Parameters
+    ----------
+    query_execution_ids : List[str]
+        Athena query execution IDs.
+    return_unprocessed : bool
+        True to also return the query execution IDs that could not be processed.
+        False to only return the DataFrame of query execution details.
+        Default is False.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
+
+    Returns
+    -------
+    Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]
+        DataFrame of query execution details. If ``return_unprocessed`` is True, a tuple of
+        that DataFrame and a DataFrame of the unprocessed query execution IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> query_executions_df, unprocessed_query_executions_df = wr.athena.get_query_executions(
+    ...     query_execution_ids=['query-execution-id', 'query-execution-id1'],
+    ...     return_unprocessed=True,
+    ... )
+    """
+    chunked_size: int = 50
+    query_executions: List[Dict[str, Any]] = []
+    unprocessed_query_execution: List[Dict[str, str]] = []
+    client_athena: boto3.client = _utils.client(service_name="athena", session=boto3_session)
+    # Query in slices of `chunked_size` IDs; the batch API caps the number of IDs per request.
+    for i in range(0, len(query_execution_ids), chunked_size):
+        response = client_athena.batch_get_query_execution(QueryExecutionIds=query_execution_ids[i : i + chunked_size])
+        query_executions += response["QueryExecutions"]
+        unprocessed_query_execution += response["UnprocessedQueryExecutionIds"]
+    if unprocessed_query_execution and not return_unprocessed:
+        _logger.warning(
+            "Some of query execution ids are unable to be processed. "
+            "Set return_unprocessed to True to get unprocessed query execution ids"
+        )
+    if return_unprocessed:
+        return pd.json_normalize(query_executions), pd.json_normalize(unprocessed_query_execution)
+    return pd.json_normalize(query_executions)
+
+
+def list_query_executions(workgroup: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> List[str]:
+    """Fetch the list of query execution IDs run in the specified workgroup (primary workgroup if not specified).
+
+    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/athena.html#Athena.Client.list_query_executions
+
+    Parameters
+    ----------
+    workgroup : str, optional
+        The name of the workgroup from which the query execution IDs are returned.
+        If not specified, a list of available query execution IDs for the queries in the primary workgroup is returned.
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
+
+    Returns
+    -------
+    List[str]
+        List of query execution IDs.
+
+    Examples
+    --------
+    >>> import awswrangler as wr
+    >>> res = wr.athena.list_query_executions(workgroup='workgroup-name')
+
+    """
+    client_athena: boto3.client = _utils.client(service_name="athena", session=boto3_session)
+    kwargs: Dict[str, Any] = {"base": 1}  # NOTE(review): presumably try_it's backoff base — confirm
+    if workgroup:
+        kwargs["WorkGroup"] = workgroup
+    query_list: List[str] = []
+    response: Dict[str, Any] = _utils.try_it(
+        f=client_athena.list_query_executions,
+        ex=botocore.exceptions.ClientError,
+        ex_code="ThrottlingException",
+        max_num_tries=5,
+        **kwargs,
+    )
+    query_list += response["QueryExecutionIds"]
+    while "NextToken" in response:  # keep paging until no continuation token is returned
+        kwargs["NextToken"] = response["NextToken"]
+        response = _utils.try_it(
+            f=client_athena.list_query_executions,
+            ex=botocore.exceptions.ClientError,
+            ex_code="ThrottlingException",
+            max_num_tries=5,
+            **kwargs,
+        )
+        query_list += response["QueryExecutionIds"]
+    return query_list
diff --git a/awswrangler/distributed/ray/_register.py b/awswrangler/distributed/ray/_register.py
@@ -2,7 +2,7 @@
 # pylint: disable=import-outside-toplevel
 from awswrangler._data_types import pyarrow_types_from_pandas
 from awswrangler._distributed import MemoryFormatEnum, engine, memory_format
-from awswrangler._utils import table_refs_to_df
+from awswrangler._utils import is_pandas_frame, table_refs_to_df
 from awswrangler.distributed.ray._core import ray_remote
 from awswrangler.lakeformation._read import _get_work_unit_results
 from awswrangler.s3._delete import _delete_objects
@@ -30,7 +30,7 @@ def register_ray() -> None:
     if memory_format.get() == MemoryFormatEnum.MODIN:
         from awswrangler.distributed.ray.modin._core import modin_repartition
         from awswrangler.distributed.ray.modin._data_types import pyarrow_types_from_pandas_distributed
-        from awswrangler.distributed.ray.modin._utils import _arrow_refs_to_df
+        from awswrangler.distributed.ray.modin._utils import _arrow_refs_to_df, _is_pandas_or_modin_frame
         from awswrangler.distributed.ray.modin.s3._read_parquet import _read_parquet_distributed
         from awswrangler.distributed.ray.modin.s3._read_text import _read_text_distributed
         from awswrangler.distributed.ray.modin.s3._write_dataset import (
@@ -52,5 +52,6 @@ def register_ray() -> None:
             to_json: modin_repartition(to_json),
             to_parquet: modin_repartition(to_parquet),
             table_refs_to_df: _arrow_refs_to_df,
+            is_pandas_frame: _is_pandas_or_modin_frame,
         }.items():
             engine.register_func(o_f, d_f)  # type: ignore
diff --git a/awswrangler/distributed/ray/modin/_utils.py b/awswrangler/distributed/ray/modin/_utils.py
@@ -47,6 +47,10 @@ def _arrow_refs_to_df(arrow_refs: List[Callable[..., Any]], kwargs: Optional[Dic
     return _to_modin(dataset=ray.data.from_arrow_refs(arrow_refs), to_pandas_kwargs=kwargs)
 
 
+def _is_pandas_or_modin_frame(obj: Any) -> bool:  # True if obj is a pd.DataFrame or a modin_pd.DataFrame
+    return isinstance(obj, (pd.DataFrame, modin_pd.DataFrame))
+
+
 @dataclass
 class ParamConfig:
     """
diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py
@@ -729,7 +729,7 @@ def read_parquet_table(
     partial_cast_function = functools.partial(
         _data_types.cast_pandas_with_athena_types, dtype=_extract_partitions_dtypes_from_table_details(response=res)
     )
-    if isinstance(df, pd.DataFrame):
+    if _utils.is_pandas_frame(df):
         return partial_cast_function(df)
     # df is a generator, so map is needed for casting dtypes
     return map(partial_cast_function, df)
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -119,9 +119,11 @@ Amazon Athena
     generate_create_query
     get_query_columns_types
     get_query_execution
+    get_query_executions
     get_query_results
     get_named_query_statement
     get_work_group
+    list_query_executions
     read_sql_query
     read_sql_table
     repair_table
diff --git a/tests/unit/test_athena.py b/tests/unit/test_athena.py
@@ -679,6 +679,19 @@ def test_read_sql_query_wo_results(path, glue_database, glue_table):
     ensure_athena_query_metadata(df=df, ctas_approach=False, encrypted=False)
 
 
+@pytest.mark.parametrize("ctas_approach", [False, True])
+def test_read_sql_query_wo_results_chunked(path, glue_database, glue_table, ctas_approach):
+    wr.catalog.create_parquet_table(database=glue_database, table=glue_table, path=path, columns_types={"c0": "int"})
+    sql = f"SELECT * FROM {glue_database}.{glue_table}"
+
+    counter = 0  # number of chunks yielded by the chunked read
+    for df in wr.athena.read_sql_query(sql, database=glue_database, ctas_approach=ctas_approach, chunksize=100):
+        assert df.empty
+        counter += 1
+
+    assert counter == 1  # an empty result must still yield exactly one (empty) chunk
+
+
 @pytest.mark.xfail()
 def test_read_sql_query_wo_results_ctas(path, glue_database, glue_table):
     wr.catalog.create_parquet_table(database=glue_database, table=glue_table, path=path, columns_types={"c0": "int"})
@@ -1304,3 +1317,22 @@ def test_athena_generate_create_query(path, glue_database, glue_table):
     )
     wr.athena.start_query_execution(sql=query, database=glue_database, wait=True)
     assert query == wr.athena.generate_create_query(database=glue_database, table=glue_table)
+
+
+def test_get_query_execution(workgroup0, workgroup1):
+    query_execution_ids = wr.athena.list_query_executions(workgroup=workgroup0) + wr.athena.list_query_executions(
+        workgroup=workgroup1
+    )
+    assert query_execution_ids
+    query_execution_detail = wr.athena.get_query_execution(query_execution_id=query_execution_ids[0])
+    query_executions_df = wr.athena.get_query_executions(query_execution_ids)
+    assert isinstance(query_executions_df, pd.DataFrame)
+    assert isinstance(query_execution_detail, dict)
+    assert set(query_execution_ids).intersection(set(query_executions_df["QueryExecutionId"].values.tolist()))
+    query_execution_ids1 = query_execution_ids + ["aaa", "bbb"]  # extra IDs expected back as unprocessed
+    query_executions_df, unprocessed_query_executions_df = wr.athena.get_query_executions(
+        query_execution_ids1, return_unprocessed=True
+    )
+    assert isinstance(unprocessed_query_executions_df, pd.DataFrame)
+    assert set(query_execution_ids).intersection(set(query_executions_df["QueryExecutionId"].values.tolist()))
+    assert {"aaa", "bbb"}.intersection(set(unprocessed_query_executions_df["QueryExecutionId"].values.tolist()))
diff --git a/tutorials/006 - Amazon Athena.ipynb b/tutorials/006 - Amazon Athena.ipynb
@@ -143,7 +143,7 @@
     "    mode=\"overwrite\",\n",
     "    database=\"awswrangler_test\",\n",
     "    table=\"noaa\"\n",
-    ");"
+    ")"
    ]
   },
   {
diff --git a/tutorials/007 - Redshift, MySQL, PostgreSQL, SQL Server, Oracle.ipynb b/tutorials/007 - Redshift, MySQL, PostgreSQL, SQL Server, Oracle.ipynb
@@ -8,7 +8,7 @@
     "\n",
     "# 7 - Redshift, MySQL, PostgreSQL, SQL Server and Oracle\n",
     "\n",
-    "[awswrangler](https://github.com/aws/aws-sdk-pandas)'s Redshift, MySQL and PostgreSQL have two basic function in common that tries to follow the Pandas conventions, but add more data type consistency.\n",
+    "[awswrangler](https://github.com/aws/aws-sdk-pandas)'s Redshift, MySQL and PostgreSQL have two basic functions in common that try to follow Pandas conventions, but add more data type consistency.\n",
     "\n",
     "- [wr.redshift.to_sql()](https://aws-sdk-pandas.readthedocs.io/en/3.0.0b3/stubs/awswrangler.redshift.to_sql.html)\n",
     "- [wr.redshift.read_sql_query()](https://aws-sdk-pandas.readthedocs.io/en/3.0.0b3/stubs/awswrangler.redshift.read_sql_query.html)\n",
diff --git a/tutorials/014 - Schema Evolution.ipynb b/tutorials/014 - Schema Evolution.ipynb
@@ -8,7 +8,7 @@
     "\n",
     "# 14 - Schema Evolution\n",
     "\n",
-    "awswrangler support new **columns** on Parquet and CSV datasets through:\n",
+    "awswrangler supports new **columns** on Parquet and CSV datasets through:\n",
     "\n",
     "- [wr.s3.to_parquet()](https://aws-sdk-pandas.readthedocs.io/en/3.0.0b3/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet)\n",
     "- [wr.s3.store_parquet_metadata()](https://aws-sdk-pandas.readthedocs.io/en/3.0.0b3/stubs/awswrangler.s3.store_parquet_metadata.html#awswrangler.s3.store_parquet_metadata) i.e. \"Crawler\"\n",
diff --git a/tutorials/015 - EMR.ipynb b/tutorials/015 - EMR.ipynb
@@ -160,13 +160,6 @@
    "source": [
     "wr.emr.terminate_cluster(cluster_id)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorials/016 - EMR & Docker.ipynb b/tutorials/016 - EMR & Docker.ipynb
@@ -201,7 +201,7 @@
     "print(f\"awswrangler version: {wr.__version__}\")\n",
     "\"\"\"\n",
     "\n",
-    "boto3.client(\"s3\").put_object(Body=script, Bucket=bucket, Key=\"test_docker.py\");"
+    "boto3.client(\"s3\").put_object(Body=script, Bucket=bucket, Key=\"test_docker.py\")"
    ]
   },
   {
@@ -329,13 +329,6 @@
     "\n",
     "wr.emr.terminate_cluster(cluster_id)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorials/017 - Partition Projection.ipynb b/tutorials/017 - Partition Projection.ipynb
@@ -159,7 +159,7 @@
     "        \"month\": \"1,12\",\n",
     "        \"day\": \"1,31\"\n",
     "    },\n",
-    ");"
+    ")"
    ]
   },
   {
@@ -334,7 +334,7 @@
     "    projection_values={\n",
     "        \"city\": \"São Paulo,Tokio,Seattle\"\n",
     "    },\n",
-    ");"
+    ")"
    ]
   },
   {
@@ -511,7 +511,7 @@
     "        \"dt\": \"2020-01-01,2020-01-03\",\n",
     "        \"ts\": \"2020-01-01 00:00:00,2020-01-01 00:00:02\"\n",
     "    },\n",
-    ");"
+    ")"
    ]
   },
   {
@@ -679,7 +679,7 @@
     "    projection_types={\n",
     "        \"uuid\": \"injected\",\n",
     "    }\n",
-    ");"
+    ")"
    ]
   },
   {
diff --git a/tutorials/018 - QuickSight.ipynb b/tutorials/018 - QuickSight.ipynb
@@ -16,7 +16,7 @@
     "* [Exploring the public AWS COVID-19 data lake](https://aws.amazon.com/blogs/big-data/exploring-the-public-aws-covid-19-data-lake/)\n",
     "* [CloudFormation template](https://covid19-lake.s3.us-east-2.amazonaws.com/cfn/CovidLakeStack.template.json)\n",
     "\n",
-    "*Please, install the Cloudformation template above to have access to the public data lake.*\n",
+    "*Please install the CloudFormation template above to have access to the public data lake.*\n",
     "\n",
     "*P.S. To be able to access the public data lake, you must allow explicitly QuickSight to access the related external bucket.*"
    ]
diff --git a/tutorials/019 - Athena Cache.ipynb b/tutorials/019 - Athena Cache.ipynb
@@ -8,13 +8,13 @@
     "\n",
     "# 19 - Amazon Athena Cache\n",
     "\n",
-    "[awswrangler](https://github.com/aws/aws-sdk-pandas) has a cache strategy that is disabled by default and can be enabled passing `max_cache_seconds` biggier than 0. This cache strategy for Amazon Athena can help you to **decrease query times and costs**.\n",
+    "[awswrangler](https://github.com/aws/aws-sdk-pandas) has a cache strategy that is disabled by default and can be enabled by passing a `max_cache_seconds` value greater than 0. This cache strategy for Amazon Athena can help you to **decrease query times and costs**.\n",
     "\n",
     "When calling `read_sql_query`, instead of just running the query, we now can verify if the query has been run before. If so, and this last run was within `max_cache_seconds` (a new parameter to `read_sql_query`), we return the same results as last time if they are still available in S3. We have seen this increase performance more than 100x, but the potential is pretty much infinite.\n",
     "\n",
     "The detailed approach is:\n",
     "- When `read_sql_query` is called with `max_cache_seconds > 0` (it defaults to 0), we check for the last queries run by the same workgroup (the most we can get without pagination).\n",
-    "- By default it will check the last 50 queries, but you can customize it throught the `max_cache_query_inspections` argument.\n",
+    "- By default it will check the last 50 queries, but you can customize it through the `max_cache_query_inspections` argument.\n",
     "- We then sort those queries based on CompletionDateTime, descending\n",
     "- For each of those queries, we check if their CompletionDateTime is still within the `max_cache_seconds` window. If so, we check if the query string is the same as now (with some smart heuristics to guarantee coverage over both `ctas_approach`es). If they are the same, we check if the last one's results are still on S3, and then return them instead of re-running the query.\n",
     "- During the whole cache resolution phase, if there is anything wrong, the logic falls back to the usual `read_sql_query` path.\n",
@@ -292,7 +292,7 @@
     "    mode=\"overwrite\",\n",
     "    database=\"awswrangler_test\",\n",
     "    table=\"noaa\"\n",
-    ");"
+    ")"
    ]
   },
   {
diff --git a/tutorials/020 - Spark Table Interoperability.ipynb b/tutorials/020 - Spark Table Interoperability.ipynb
diff --git a/tutorials/022 - Writing Partitions Concurrently.ipynb b/tutorials/022 - Writing Partitions Concurrently.ipynb
diff --git a/tutorials/025 - Redshift - Loading Parquet files with Spectrum.ipynb b/tutorials/025 - Redshift - Loading Parquet files with Spectrum.ipynb
diff --git a/tutorials/026 - Amazon Timestream.ipynb b/tutorials/026 - Amazon Timestream.ipynb
diff --git a/tutorials/027 - Amazon Timestream 2.ipynb b/tutorials/027 - Amazon Timestream 2.ipynb
diff --git a/tutorials/033 - Amazon Neptune.ipynb b/tutorials/033 - Amazon Neptune.ipynb

Original file line number	Diff line number	Diff line change
`@@ -729,7 +729,7 @@ def read_parquet_table(`
`729`	`729`	`partial_cast_function = functools.partial(`
`730`	`730`	`_data_types.cast_pandas_with_athena_types, dtype=_extract_partitions_dtypes_from_table_details(response=res)`
`731`	`731`	`)`
`732`		`- if isinstance(df, pd.DataFrame):`
	`732`	`+ if _utils.is_pandas_frame(df):`
`733`	`733`	`return partial_cast_function(df)`
`734`	`734`	`# df is a generator, so map is needed for casting dtypes`
`735`	`735`	`return map(partial_cast_function, df)`
Original file line number	Diff line number	Diff line change
`@@ -143,7 +143,7 @@`
`143`	`143`	`" mode=\"overwrite\",\n",`
`144`	`144`	`" database=\"awswrangler_test\",\n",`
`145`	`145`	`" table=\"noaa\"\n",`
`146`		`- ");"`
	`146`	`+ ")"`
`147`	`147`	`]`
`148`	`148`	`},`
`149`	`149`	`{`