Set ctas_approach to False by default

igorborgest · igorborgest · commit ed87ba62a259 · 2019-12-10T21:45:38.000-03:00
diff --git a/awswrangler/pandas.py b/awswrangler/pandas.py
@@ -499,17 +499,17 @@ def read_sql_athena(self,
                         workgroup: Optional[str] = None,
                         encryption: Optional[str] = None,
                         kms_key: Optional[str] = None,
-                        ctas_approach: bool = True,
+                        ctas_approach: bool = False,
                         procs_cpu_bound: Optional[int] = None,
                         max_result_size: Optional[int] = None):
         """
         Executes any SQL query on AWS Athena and return a Dataframe of the result.
         There are two approaches to be defined through ctas_approach parameter:
-        1 - ctas_approach True (Default):
+        1 - ctas_approach True (for huge results):
             Wrap the query with a CTAS and then reads the table data as parquet directly from s3.
             PROS: Faster and has a better handle of nested types
-            CONS: Can't use max_result_size.
-        2 - ctas_approach False:
+            CONS: Can't use max_result_size, and requires create and drop table permissions.
+        2 - ctas_approach False (Default):
             Does a regular query on Athena and parse the regular CSV result on s3
             PROS: Accepts max_result_size.
             CONS: Slower (But stills faster than other libraries that uses the Athena API) and does not handle nested types so well
diff --git a/testing/test_awswrangler/test_pandas.py b/testing/test_awswrangler/test_pandas.py
@@ -434,7 +434,7 @@ def test_etl_complex_ctas(session, bucket, database):
                               mode="overwrite",
                               procs_cpu_bound=1)
     sleep(1)
-    df = session.pandas.read_sql_athena(sql="select * from test", database=database)
+    df = session.pandas.read_sql_athena(ctas_approach=True, sql="select * from test", database=database)
     for row in df.itertuples():
         assert isinstance(row.my_timestamp, datetime)
         assert isinstance(row.my_date, date)
@@ -731,7 +731,7 @@ def test_to_parquet_with_cast_null(
     assert len(list(dataframe.columns)) == len(list(dataframe2.columns))
 
 
-def test_read_sql_athena_with_time_zone(session, bucket, database):
+def test_read_sql_athena_with_time_zone(session, database):
     query = "select current_timestamp as value, typeof(current_timestamp) as type"
     dataframe = session.pandas.read_sql_athena(ctas_approach=False, sql=query, database=database)
     assert len(dataframe.index) == 1
@@ -1507,6 +1507,6 @@ def test_read_sql_athena_ctas(session, bucket, database):
                               preserve_index=False,
                               procs_cpu_bound=4,
                               partition_cols=["partition"])
-    df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
+    df2 = session.pandas.read_sql_athena(ctas_approach=True, sql="select * from test", database=database)
     assert len(list(df.columns)) == len(list(df2.columns))
     assert len(df.index) == len(df2.index)