@@ -1196,11 +1196,11 @@ def _read_parquet_init(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     categories: List[str] = None,
+    validate_schema: bool = True,
     dataset: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
-    validate_schema: bool = True,
 ) -> pyarrow.parquet.ParquetDataset:
     """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
     if dataset is False:
@@ -1227,13 +1227,13 @@ def read_parquet(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     columns: Optional[List[str]] = None,
+    validate_schema: bool = True,
     chunked: bool = False,
     dataset: bool = False,
     categories: List[str] = None,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
-    validate_schema: bool = True,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Read Apache Parquet file(s) from a received S3 prefix or list of S3 objects paths.

@@ -1251,7 +1251,11 @@ def read_parquet(
     filters: Union[List[Tuple], List[List[Tuple]]], optional
         List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
     columns : List[str], optional
-        Names of columns to read from the file(s)
+        Names of columns to read from the file(s).
+    validate_schema:
+        Check that individual file schemas are all the same / compatible. Schemas within a
+        folder prefix should all be the same. Set to False to skip this check when the files
+        are known to have different schemas.
     chunked : bool
         If True will break the data into smaller DataFrames (non-deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
@@ -1268,10 +1272,6 @@ def read_parquet(
     s3_additional_kwargs:
         Forward to s3fs, useful for server side encryption
         https://s3fs.readthedocs.io/en/latest/#serverside-encryption
-    validate_schema:
-        Check that individual file schemas are all the same / compatible. Schemas within a
-        folder prefix should all be the same. Disable if you have schemas that are different
-        and want to disable this check.

     Returns
     -------
@@ -1320,7 +1320,9 @@ def read_parquet(
         validate_schema=validate_schema,
     )
     if chunked is False:
-        return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)
+        return _read_parquet(
+            data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema
+        )
     return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads)


@@ -1329,14 +1331,16 @@ def _read_parquet(
     columns: Optional[List[str]] = None,
     categories: List[str] = None,
     use_threads: bool = True,
+    validate_schema: bool = True,
 ) -> pd.DataFrame:
     tables: List[pa.Table] = []
     for piece in data.pieces:
         table: pa.Table = piece.read(
             columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
         )
         tables.append(table)
-    table = pa.lib.concat_tables(tables)
+    promote: bool = not validate_schema
+    table = pa.lib.concat_tables(tables, promote=promote)
     return table.to_pandas(
         use_threads=use_threads,
         split_blocks=True,
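
The `promote` flag passed to `pa.lib.concat_tables` in the last hunk is what makes `validate_schema=False` effective: with promote enabled, pyarrow unifies mismatched schemas by null-filling the columns a table is missing instead of raising on the first mismatch. A minimal sketch of that behavior (the `promote` keyword matches the pyarrow releases this code targets; newer pyarrow versions replaced it with `promote_options`):

import pyarrow as pa

t1 = pa.table({"x": [1, 2]})           # no "y" column
t2 = pa.table({"x": [3], "y": ["a"]})  # extra "y" column

# promote=False (the default) raises ArrowInvalid on mismatched schemas;
# promote=True unifies the schemas and fills missing columns with nulls.
merged = pa.concat_tables([t1, t2], promote=True)
print(merged.to_pandas())
#    x     y
# 0  1  None
# 1  2  None
# 2  3     a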
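From the caller's side, assuming this is awswrangler's `wr.s3.read_parquet` (the bucket and prefix below are hypothetical), the reordered parameter would be used like this:

import awswrangler as wr

# Hypothetical prefix whose Parquet files do not all share one schema.
df = wr.s3.read_parquet(
    path="s3://some-bucket/some-prefix/",
    dataset=True,
    validate_schema=False,  # skip the per-file schema equality check
)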