Skip to content

Commit e47cdae

Browse files
committed
Add data_source arg for athena queries. #392
1 parent 6a0fe06 commit e47cdae

File tree

4 files changed

+49
-8
lines changed

4 files changed

+49
-8
lines changed

awswrangler/athena/_read.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ def _resolve_query_with_cache(
328328
def _resolve_query_without_cache_ctas(
329329
sql: str,
330330
database: Optional[str],
331+
data_source: Optional[str],
331332
s3_output: Optional[str],
332333
keep_files: bool,
333334
chunksize: Union[int, bool, None],
@@ -357,6 +358,7 @@ def _resolve_query_without_cache_ctas(
357358
sql=sql,
358359
wg_config=wg_config,
359360
database=database,
361+
data_source=data_source,
360362
s3_output=s3_output,
361363
workgroup=workgroup,
362364
encryption=encryption,
@@ -408,6 +410,7 @@ def _resolve_query_without_cache_ctas(
408410
def _resolve_query_without_cache_regular(
409411
sql: str,
410412
database: Optional[str],
413+
data_source: Optional[str],
411414
s3_output: Optional[str],
412415
keep_files: bool,
413416
chunksize: Union[int, bool, None],
@@ -424,6 +427,7 @@ def _resolve_query_without_cache_regular(
424427
sql=sql,
425428
wg_config=wg_config,
426429
database=database,
430+
data_source=data_source,
427431
s3_output=s3_output,
428432
workgroup=workgroup,
429433
encryption=encryption,
@@ -447,6 +451,7 @@ def _resolve_query_without_cache(
447451
# pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
448452
sql: str,
449453
database: str,
454+
data_source: Optional[str],
450455
ctas_approach: bool,
451456
categories: Optional[List[str]],
452457
chunksize: Union[int, bool, None],
@@ -476,6 +481,7 @@ def _resolve_query_without_cache(
476481
return _resolve_query_without_cache_ctas(
477482
sql=sql,
478483
database=database,
484+
data_source=data_source,
479485
s3_output=_s3_output,
480486
keep_files=keep_files,
481487
chunksize=chunksize,
@@ -493,6 +499,7 @@ def _resolve_query_without_cache(
493499
return _resolve_query_without_cache_regular(
494500
sql=sql,
495501
database=database,
502+
data_source=data_source,
496503
s3_output=_s3_output,
497504
keep_files=keep_files,
498505
chunksize=chunksize,
@@ -523,6 +530,7 @@ def read_sql_query(
523530
boto3_session: Optional[boto3.Session] = None,
524531
max_cache_seconds: int = 0,
525532
max_cache_query_inspections: int = 50,
533+
data_source: Optional[str] = None,
526534
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
527535
"""Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.
528536
@@ -662,6 +670,8 @@ def read_sql_query(
662670
Max number of queries that will be inspected from the history to try to find some result to reuse.
663671
The bigger the number of inspection, the bigger will be the latency for not cached queries.
664672
Only takes effect if max_cache_seconds > 0.
673+
data_source : str, optional
674+
Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default.
665675
666676
Returns
667677
-------
@@ -701,6 +711,7 @@ def read_sql_query(
701711
return _resolve_query_without_cache(
702712
sql=sql,
703713
database=database,
714+
data_source=data_source,
704715
ctas_approach=ctas_approach,
705716
categories=categories,
706717
chunksize=chunksize,
@@ -732,6 +743,7 @@ def read_sql_table(
732743
boto3_session: Optional[boto3.Session] = None,
733744
max_cache_seconds: int = 0,
734745
max_cache_query_inspections: int = 50,
746+
data_source: Optional[str] = None,
735747
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
736748
"""Extract the full table AWS Athena and return the results as a Pandas DataFrame.
737749
@@ -868,6 +880,8 @@ def read_sql_table(
868880
Max number of queries that will be inspected from the history to try to find some result to reuse.
869881
The bigger the number of inspection, the bigger will be the latency for not cached queries.
870882
Only takes effect if max_cache_seconds > 0.
883+
data_source : str, optional
884+
Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default.
871885
872886
Returns
873887
-------
@@ -885,6 +899,7 @@ def read_sql_table(
885899
return read_sql_query(
886900
sql=f'SELECT * FROM "{table}"',
887901
database=database,
902+
data_source=data_source,
888903
ctas_approach=ctas_approach,
889904
categories=categories,
890905
chunksize=chunksize,

awswrangler/athena/_utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def _start_query_execution(
5252
sql: str,
5353
wg_config: _WorkGroupConfig,
5454
database: Optional[str] = None,
55+
data_source: Optional[str] = None,
5556
s3_output: Optional[str] = None,
5657
workgroup: Optional[str] = None,
5758
encryption: Optional[str] = None,
@@ -81,6 +82,8 @@ def _start_query_execution(
8182
# database
8283
if database is not None:
8384
args["QueryExecutionContext"] = {"Database": database}
85+
if data_source is not None:
86+
args["QueryExecutionContext"]["Catalog"] = data_source
8487

8588
# workgroup
8689
if workgroup is not None:
@@ -312,6 +315,7 @@ def start_query_execution(
312315
encryption: Optional[str] = None,
313316
kms_key: Optional[str] = None,
314317
boto3_session: Optional[boto3.Session] = None,
318+
data_source: Optional[str] = None,
315319
) -> str:
316320
"""Start a SQL Query against AWS Athena.
317321
@@ -336,6 +340,8 @@ def start_query_execution(
336340
For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID.
337341
boto3_session : boto3.Session(), optional
338342
Boto3 Session. The default boto3 session will be used if boto3_session receive None.
343+
data_source : str, optional
344+
Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default.
339345
340346
Returns
341347
-------
@@ -344,16 +350,24 @@ def start_query_execution(
344350
345351
Examples
346352
--------
353+
Querying the default data source (Amazon S3 - 'AwsDataCatalog')
354+
347355
>>> import awswrangler as wr
348356
>>> query_exec_id = wr.athena.start_query_execution(sql='...', database='...')
349357
358+
Querying another data source (PostgreSQL, Redshift, etc.)
359+
360+
>>> import awswrangler as wr
361+
>>> query_exec_id = wr.athena.start_query_execution(sql='...', database='...', data_source='...')
362+
350363
"""
351364
session: boto3.Session = _utils.ensure_session(session=boto3_session)
352365
wg_config: _WorkGroupConfig = _get_workgroup_config(session=session, workgroup=workgroup)
353366
return _start_query_execution(
354367
sql=sql,
355368
wg_config=wg_config,
356369
database=database,
370+
data_source=data_source,
357371
s3_output=s3_output,
358372
workgroup=workgroup,
359373
encryption=encryption,

tests/test_athena.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -775,10 +775,11 @@ def test_describe_table(path, glue_database, glue_table):
775775
assert wr.athena.describe_table(database=glue_database, table=glue_table).shape == (1, 4)
776776

777777

778+
@pytest.mark.parametrize("data_source", [None, "AwsDataCatalog"])
778779
@pytest.mark.parametrize("ctas_approach", [False, True])
779-
def test_athena_nan_inf(glue_database, ctas_approach):
780+
def test_athena_nan_inf(glue_database, ctas_approach, data_source):
780781
sql = "SELECT nan() AS nan, infinity() as inf, -infinity() as inf_n, 1.2 as regular"
781-
df = wr.athena.read_sql_query(sql, glue_database, ctas_approach)
782+
df = wr.athena.read_sql_query(sql, glue_database, ctas_approach, data_source=data_source)
782783
print(df)
783784
print(df.dtypes)
784785
assert df.shape == (1, 4)

tests/test_athena_cache.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from unittest.mock import patch
33

44
import pandas as pd
5+
import pytest
56

67
import awswrangler as wr
78

@@ -35,7 +36,8 @@ def test_athena_cache(path, glue_database, glue_table, workgroup1):
3536
assert len(list(dfs)) == 2
3637

3738

38-
def test_cache_query_ctas_approach_true(path, glue_database, glue_table):
39+
@pytest.mark.parametrize("data_source", [None, "AwsDataCatalog"])
40+
def test_cache_query_ctas_approach_true(path, glue_database, glue_table, data_source):
3941
df = pd.DataFrame({"c0": [0, None]}, dtype="Int64")
4042
paths = wr.s3.to_parquet(
4143
df=df,
@@ -54,20 +56,25 @@ def test_cache_query_ctas_approach_true(path, glue_database, glue_table):
5456
"awswrangler.athena._read._check_for_cached_results",
5557
return_value=wr.athena._read._CacheInfo(has_valid_cache=False),
5658
) as mocked_cache_attempt:
57-
df2 = wr.athena.read_sql_table(glue_table, glue_database, ctas_approach=True, max_cache_seconds=0)
59+
df2 = wr.athena.read_sql_table(
60+
glue_table, glue_database, ctas_approach=True, max_cache_seconds=0, data_source=data_source
61+
)
5862
mocked_cache_attempt.assert_called()
5963
assert df.shape == df2.shape
6064
assert df.c0.sum() == df2.c0.sum()
6165

6266
with patch("awswrangler.athena._read._resolve_query_without_cache") as resolve_no_cache:
63-
df3 = wr.athena.read_sql_table(glue_table, glue_database, ctas_approach=True, max_cache_seconds=900)
67+
df3 = wr.athena.read_sql_table(
68+
glue_table, glue_database, ctas_approach=True, max_cache_seconds=900, data_source=data_source
69+
)
6470
resolve_no_cache.assert_not_called()
6571
assert df.shape == df3.shape
6672
assert df.c0.sum() == df3.c0.sum()
6773
ensure_athena_query_metadata(df=df3, ctas_approach=True, encrypted=False)
6874

6975

70-
def test_cache_query_ctas_approach_false(path, glue_database, glue_table):
76+
@pytest.mark.parametrize("data_source", [None, "AwsDataCatalog"])
77+
def test_cache_query_ctas_approach_false(path, glue_database, glue_table, data_source):
7178
df = pd.DataFrame({"c0": [0, None]}, dtype="Int64")
7279
paths = wr.s3.to_parquet(
7380
df=df,
@@ -86,13 +93,17 @@ def test_cache_query_ctas_approach_false(path, glue_database, glue_table):
8693
"awswrangler.athena._read._check_for_cached_results",
8794
return_value=wr.athena._read._CacheInfo(has_valid_cache=False),
8895
) as mocked_cache_attempt:
89-
df2 = wr.athena.read_sql_table(glue_table, glue_database, ctas_approach=False, max_cache_seconds=0)
96+
df2 = wr.athena.read_sql_table(
97+
glue_table, glue_database, ctas_approach=False, max_cache_seconds=0, data_source=data_source
98+
)
9099
mocked_cache_attempt.assert_called()
91100
assert df.shape == df2.shape
92101
assert df.c0.sum() == df2.c0.sum()
93102

94103
with patch("awswrangler.athena._read._resolve_query_without_cache") as resolve_no_cache:
95-
df3 = wr.athena.read_sql_table(glue_table, glue_database, ctas_approach=False, max_cache_seconds=900)
104+
df3 = wr.athena.read_sql_table(
105+
glue_table, glue_database, ctas_approach=False, max_cache_seconds=900, data_source=data_source
106+
)
96107
resolve_no_cache.assert_not_called()
97108
assert df.shape == df3.shape
98109
assert df.c0.sum() == df3.c0.sum()

0 commit comments

Comments
 (0)