Commit e578a24

Fix Athena ctas_approach issue with immutability. #335

1 parent 98b801d

File tree

6 files changed: +24 −15 lines changed

README.md

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 
 [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
-[![Coverage](https://img.shields.io/badge/coverage-92%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Coverage](https://img.shields.io/badge/coverage-93%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=master)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest)

awswrangler/_utils.py

Lines changed: 4 additions & 4 deletions

@@ -257,12 +257,12 @@ def list_sampling(lst: List[Any], sampling: float) -> List[Any]:
 
 def ensure_df_is_mutable(df: pd.DataFrame) -> pd.DataFrame:
     """Ensure that all columns have the writeable flag True."""
-    columns: List[str] = df.columns.to_list()
-    for column in columns:
+    for column in df.columns.to_list():
         if hasattr(df[column].values, "flags") is True:
             if df[column].values.flags.writeable is False:
-                df = df.copy(deep=True)
-                break
+                s: pd.Series = df[column]
+                df[column] = None
+                df[column] = s
     return df
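Rather than deep-copying the whole DataFrame as soon as one read-only column is found (the old copy-and-break approach), the loop now re-assigns only the offending columns, which makes pandas allocate a fresh, writable block per column. A standalone sketch of the failure mode and the re-assignment trick (illustrative only, not the library's code):

# Simulate what a zero-copy Arrow-to-pandas conversion can produce:
# a column backed by a read-only NumPy buffer.
import numpy as np
import pandas as pd

arr = np.arange(3)
arr.setflags(write=False)
df = pd.DataFrame(arr, columns=["c0"])  # whether the buffer is shared depends on the pandas version

if hasattr(df["c0"].values, "flags") and df["c0"].values.flags.writeable is False:
    # The commit's per-column fix: re-assigning the column forces pandas
    # to allocate a fresh, writable block instead of copying the whole frame.
    s = df["c0"]
    df["c0"] = None
    df["c0"] = s

# Column updates now work; on older pandas a read-only block here could
# raise "ValueError: assignment destination is read-only".
df["c0"] = df["c0"] + 1
print(df["c0"].tolist())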
awswrangler/s3/_read_parquet.py

Lines changed: 10 additions & 9 deletions

@@ -169,7 +169,7 @@ def _arrowtable2df(
     path: str,
     path_root: Optional[str],
 ) -> pd.DataFrame:
-    return _apply_partitions(
+    df: pd.DataFrame = _apply_partitions(
         df=table.to_pandas(
             use_threads=use_threads,
             split_blocks=True,
@@ -185,6 +185,7 @@ def _arrowtable2df(
         path=path,
         path_root=path_root,
     )
+    return _utils.ensure_df_is_mutable(df=df)
 
 
 def _read_parquet_chunked(
@@ -254,7 +255,7 @@ def _read_parquet_chunked(
             yield next_slice
 
 
-def _read_parquet_file_single_thread(
+def _read_parquet_file(
     path: str,
     columns: Optional[List[str]],
     categories: Optional[List[str]],
@@ -285,7 +286,7 @@ def _count_row_groups(
     return pq_file.num_row_groups
 
 
-def _read_parquet_file_multi_thread(
+def _read_parquet_row_group(
     row_group: int,
     path: str,
     columns: Optional[List[str]],
@@ -306,7 +307,7 @@ def _read_parquet_file_multi_thread(
     return pq_file.read_row_group(i=row_group, columns=columns, use_threads=False, use_pandas_metadata=False)
 
 
-def _read_parquet_file(
+def _read_parquet(
     path: str,
     columns: Optional[List[str]],
     categories: Optional[List[str]],
@@ -318,7 +319,7 @@ def _read_parquet_file(
     use_threads: bool,
 ) -> pd.DataFrame:
     if use_threads is False:
-        table: pa.Table = _read_parquet_file_single_thread(
+        table: pa.Table = _read_parquet_file(
             path=path,
             columns=columns,
             categories=categories,
@@ -333,7 +334,7 @@ def _read_parquet_file(
     with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
         tables: Tuple[pa.Table, ...] = tuple(
             executor.map(
-                _read_parquet_file_multi_thread,
+                _read_parquet_row_group,
                 range(num_row_groups),
                 itertools.repeat(path),
                 itertools.repeat(columns),
@@ -529,7 +530,7 @@ def read_parquet(
     if chunked is not False:
         return _read_parquet_chunked(paths=paths, chunked=chunked, validate_schema=validate_schema, **args)
     if len(paths) == 1:
-        return _read_parquet_file(path=paths[0], **args)
+        return _read_parquet(path=paths[0], **args)
     if validate_schema is True:
         _validate_schemas_from_files(
             paths=paths,
@@ -540,8 +541,8 @@ def read_parquet(
         )
     if use_threads is True:
         args["use_threads"] = True
-        return _read_concurrent(func=_read_parquet_file, ignore_index=True, paths=paths, **args)
-    return _union(dfs=[_read_parquet_file(path=p, **args) for p in paths], ignore_index=True)
+        return _read_concurrent(func=_read_parquet, ignore_index=True, paths=paths, **args)
+    return _union(dfs=[_read_parquet(path=p, **args) for p in paths], ignore_index=True)
 
 
 @apply_configs
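The renames straighten out the call chain: read_parquet dispatches to _read_parquet, which either reads the whole file through _read_parquet_file (use_threads=False) or fans out one _read_parquet_row_group task per row group on a thread pool. A simplified sketch of that fan-out, assuming a local path and no column projection (the real functions also thread through columns, categories, and the boto3 session):

import concurrent.futures
import itertools
import os

import pyarrow as pa
import pyarrow.parquet as pq


def read_row_group(row_group: int, path: str) -> pa.Table:
    # Each task opens its own handle and reads a single row group with
    # use_threads=False, so all parallelism comes from the pool itself.
    return pq.ParquetFile(path).read_row_group(i=row_group, use_threads=False)


def read_parquet_concurrent(path: str) -> pa.Table:
    num_row_groups = pq.ParquetFile(path).num_row_groups
    cpus = os.cpu_count() or 1
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        tables = tuple(executor.map(read_row_group, range(num_row_groups), itertools.repeat(path)))
    return pa.concat_tables(tables)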

test.sh

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ microtime() {
 
 START=$(microtime)
 
-./validation.sh
+./validate.sh
 tox -e ALL
 coverage html --directory coverage
 rm -rf .coverage*

tests/test_athena_parquet.py

Lines changed: 8 additions & 0 deletions

@@ -424,3 +424,11 @@ def test_read_parquet_filter_partitions(path, glue_table, glue_database, use_threads):
     assert df2.c0.iloc[0] == i
     assert df2.c1.iloc[0] == i
     assert df2.c2.iloc[0] == i
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_read_parquet_mutability(path, glue_table, glue_database, use_threads):
+    sql = "SELECT timestamp '2012-08-08 01:00' AS c0"
+    df = wr.athena.read_sql_query(sql, "default", use_threads=use_threads)
+    df["c0"] = df["c0"] + pd.DateOffset(months=-2)
+    assert df.c0[0].value == 1339117200000000000
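The new test exercises the operation that previously failed on an immutable ctas_approach result: adding a DateOffset to an existing column. The expected value can be checked offline, with no AWS access:

import pandas as pd

# 2012-08-08 01:00 minus two months is 2012-06-08 01:00;
# Timestamp.value is nanoseconds since the Unix epoch.
ts = pd.Timestamp("2012-08-08 01:00") + pd.DateOffset(months=-2)
assert ts == pd.Timestamp("2012-06-08 01:00")
assert ts.value == 1339117200000000000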
validation.sh → validate.sh

File renamed without changes.
