Skip to content

Commit b952eb7

Browse files
committed
Fix general Athena cache bugs.
1 parent 1d0bbec commit b952eb7

File tree

3 files changed

+93
-6
lines changed

3 files changed

+93
-6
lines changed

awswrangler/_data_types.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,9 +439,13 @@ def cast_pandas_with_athena_types(df: pd.DataFrame, dtype: Dict[str, str]) -> pd
439439
)
440440
elif pandas_type == "string":
441441
curr_type: str = str(df[col].dtypes)
442-
if curr_type.startswith("int") or curr_type.startswith("float"):
442+
print(curr_type)
443+
if (curr_type.lower().startswith("int") is True) or (curr_type.startswith("float") is True):
443444
df[col] = df[col].astype(str).astype("string")
445+
elif curr_type.startswith("object") is True:
446+
df[col] = df[col].astype(str)
444447
else:
448+
print(col)
445449
df[col] = df[col].astype("string")
446450
else:
447451
try:

awswrangler/athena.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals,too-man
506506
max_cache_seconds=max_cache_seconds,
507507
max_cache_query_inspections=max_cache_query_inspections,
508508
)
509+
_logger.debug("cache_info: %s", cache_info)
509510

510511
if cache_info["has_valid_cache"] is True:
511512
_logger.debug("Valid cache found. Retrieving...")
@@ -687,6 +688,7 @@ def _resolve_query_with_cache( # pylint: disable=too-many-return-statements
687688
session: Optional[boto3.Session],
688689
):
689690
"""Fetch cached data and return it as a pandas Dataframe (or list of Dataframes)."""
691+
_logger.debug("cache_info: %s", cache_info)
690692
if cache_info["data_type"] == "parquet":
691693
manifest_path = cache_info["query_execution_info"]["Statistics"]["DataManifestLocation"]
692694
# this is needed just so we can access boto's modeled exceptions
@@ -970,7 +972,9 @@ def _prepare_query_string_for_comparison(query_string: str) -> str:
970972
"""To use cached data, we need to compare queries. Returns a query string in canonical form."""
971973
# for now this is a simple complete strip, but it could grow into much more sophisticated
972974
# query comparison data structures
973-
return "".join(query_string.split()).strip("()").lower()
975+
query_string = "".join(query_string.split()).strip("()").lower()
976+
query_string = query_string[:-1] if query_string.endswith(";") is True else query_string
977+
return query_string
974978

975979

976980
def _get_last_query_executions(
@@ -983,6 +987,7 @@ def _get_last_query_executions(
983987
args["WorkGroup"] = workgroup
984988
paginator = client_athena.get_paginator("list_query_executions")
985989
for page in paginator.paginate(**args):
990+
_logger.debug("paginating Athena's queries history...")
986991
query_execution_id_list: List[str] = page["QueryExecutionIds"]
987992
execution_data = client_athena.batch_get_query_execution(QueryExecutionIds=query_execution_id_list)
988993
yield execution_data.get("QueryExecutions")
@@ -1026,33 +1031,45 @@ def _check_for_cached_results(
10261031
num_executions_inspected: int = 0
10271032
if max_cache_seconds > 0: # pylint: disable=too-many-nested-blocks
10281033
current_timestamp = datetime.datetime.now(datetime.timezone.utc)
1029-
print(current_timestamp)
10301034
for query_executions in _get_last_query_executions(boto3_session=session, workgroup=workgroup):
1035+
1036+
_logger.debug("len(query_executions): %s", len(query_executions))
10311037
cached_queries: List[Dict[str, Any]] = _sort_successful_executions_data(query_executions=query_executions)
10321038
comparable_sql: str = _prepare_query_string_for_comparison(sql)
1039+
_logger.debug("len(cached_queries): %s", len(cached_queries))
10331040

10341041
# this could be mapreduced, but it is only 50 items long, tops
10351042
for query_info in cached_queries:
1036-
if (current_timestamp - query_info["Status"]["CompletionDateTime"]).total_seconds() > max_cache_seconds:
1037-
break # pragma: no cover
1043+
1044+
query_timestamp: datetime.datetime = query_info["Status"]["CompletionDateTime"]
1045+
_logger.debug("current_timestamp: %s", current_timestamp)
1046+
_logger.debug("query_timestamp: %s", query_timestamp)
1047+
if (current_timestamp - query_timestamp).total_seconds() > max_cache_seconds:
1048+
return {"has_valid_cache": False} # pragma: no cover
10381049

10391050
comparison_query: Optional[str]
10401051
if query_info["StatementType"] == "DDL" and query_info["Query"].startswith("CREATE TABLE"):
10411052
parsed_query: Optional[str] = _parse_select_query_from_possible_ctas(query_info["Query"])
10421053
if parsed_query is not None:
10431054
comparison_query = _prepare_query_string_for_comparison(query_string=parsed_query)
1055+
_logger.debug("DDL - comparison_query: %s", comparison_query)
1056+
_logger.debug("DDL - comparable_sql: %s", comparable_sql)
10441057
if comparison_query == comparable_sql:
10451058
data_type = "parquet"
10461059
return {"has_valid_cache": True, "data_type": data_type, "query_execution_info": query_info}
10471060

10481061
elif query_info["StatementType"] == "DML" and not query_info["Query"].startswith("INSERT"):
10491062
comparison_query = _prepare_query_string_for_comparison(query_string=query_info["Query"])
1063+
_logger.debug("DML - comparison_query: %s", comparison_query)
1064+
_logger.debug("DML - comparable_sql: %s", comparable_sql)
10501065
if comparison_query == comparable_sql:
10511066
data_type = "csv"
10521067
return {"has_valid_cache": True, "data_type": data_type, "query_execution_info": query_info}
10531068

10541069
num_executions_inspected += 1
1070+
_logger.debug("num_executions_inspected: %s", num_executions_inspected)
1071+
_logger.debug("max_cache_query_inspections: %s", max_cache_query_inspections)
10551072
if num_executions_inspected >= max_cache_query_inspections:
1056-
break # pragma: no cover
1073+
return {"has_valid_cache": False} # pragma: no cover
10571074

10581075
return {"has_valid_cache": False}

tests/test_s3_athena.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2058,6 +2058,33 @@ def test_cache_query_ctas_approach_false(path, database, table):
20582058
assert df.c0.sum() == df3.c0.sum()
20592059

20602060

2061+
def test_cache_query_semicolon(path, database, table):
    """A query that differs from a cached one only by a trailing ";" must hit the cache."""
    expected_df = pd.DataFrame({"c0": [0, None]}, dtype="Int64")
    written_paths = wr.s3.to_parquet(
        df=expected_df,
        path=path,
        dataset=True,
        mode="overwrite",
        database=database,
        table=table,
    )["paths"]
    wr.s3.wait_objects_exist(paths=written_paths)

    # First read: force a cache miss so the query actually executes on Athena.
    with patch(
        "awswrangler.athena._check_for_cached_results", return_value={"has_valid_cache": False}
    ) as cache_probe:
        cold_df = wr.athena.read_sql_query(
            f"SELECT * FROM {table}", database=database, ctas_approach=True, max_cache_seconds=0
        )
        cache_probe.assert_called()
        assert expected_df.shape == cold_df.shape
        assert expected_df.c0.sum() == cold_df.c0.sum()

    # Second read: same statement plus a trailing semicolon — must be served
    # from cache, i.e. the uncached resolver is never invoked.
    with patch("awswrangler.athena._resolve_query_without_cache") as uncached_resolver:
        warm_df = wr.athena.read_sql_query(
            f"SELECT * FROM {table};", database=database, ctas_approach=True, max_cache_seconds=900
        )
        uncached_resolver.assert_not_called()
        assert expected_df.shape == warm_df.shape
        assert expected_df.c0.sum() == warm_df.c0.sum()
2086+
2087+
20612088
@pytest.mark.parametrize("partition_cols", [None, ["c2"], ["c1", "c2"]])
20622089
def test_metadata_partitions_dataset(path, partition_cols):
20632090
df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2": [6, 7, 8]})
@@ -2483,3 +2510,42 @@ def test_sanitize_columns(path, sanitize_columns, col):
24832510
assert len(df.index) == 3
24842511
assert len(df.columns) == 1
24852512
assert df.columns == [col]
2513+
2514+
2515+
def test_parquet_catalog_casting_to_string(path, table, database):
    """Write with (almost) every column cast to "string" and read it back three ways."""
    # Every column is cast to "string" except "double", which keeps its type.
    cast_columns = [
        "iint8",
        "iint16",
        "iint32",
        "iint64",
        "float",
        "double",
        "decimal",
        "string",
        "date",
        "timestamp",
        "bool",
        "binary",
        "category",
        "par0",
        "par1",
    ]
    casts = {column: "string" for column in cast_columns}
    casts["double"] = "double"
    written_paths = wr.s3.to_parquet(
        df=get_df_cast(),
        path=path,
        index=False,
        dataset=True,
        mode="overwrite",
        database=database,
        table=table,
        dtype=casts,
    )["paths"]
    wr.s3.wait_objects_exist(paths=written_paths)

    # All three read paths (raw parquet, CTAS, non-CTAS) must agree on shape.
    for frame in (
        wr.s3.read_parquet(path=path),
        wr.athena.read_sql_table(table=table, database=database, ctas_approach=True),
        wr.athena.read_sql_table(table=table, database=database, ctas_approach=False),
    ):
        assert len(frame.index) == 3
        assert len(frame.columns) == 15

0 commit comments

Comments
 (0)