Commit 12d4ba1

Merge pull request #168 from jewelltp/fix_chunk_issue
added validate_schema parameter to read_parquet
2 parents 66a1be0 + 59fc0d3

2 files changed: +47 additions, -4 deletions

awswrangler/s3.py

Lines changed: 20 additions & 4 deletions
@@ -1196,6 +1196,7 @@ def _read_parquet_init(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     categories: List[str] = None,
+    validate_schema: bool = True,
     dataset: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -1212,7 +1213,12 @@ def _read_parquet_init(
     fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
     cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
     data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
-        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters, read_dictionary=categories
+        path_or_paths=path_or_paths,
+        filesystem=fs,
+        metadata_nthreads=cpus,
+        filters=filters,
+        read_dictionary=categories,
+        validate_schema=validate_schema,
     )
     return data

@@ -1221,6 +1227,7 @@ def read_parquet(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     columns: Optional[List[str]] = None,
+    validate_schema: bool = True,
     chunked: bool = False,
     dataset: bool = False,
     categories: List[str] = None,
@@ -1244,7 +1251,11 @@ def read_parquet(
     filters: Union[List[Tuple], List[List[Tuple]]], optional
         List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
     columns : List[str], optional
-        Names of columns to read from the file(s)
+        Names of columns to read from the file(s).
+    validate_schema:
+        Check that individual file schemas are all the same / compatible. Schemas within a
+        folder prefix should all be the same. Disable if you have schemas that are different
+        and want to disable this check.
     chunked : bool
         If True will break the data in smaller DataFrames (Non deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
@@ -1306,9 +1317,12 @@ def read_parquet(
         use_threads=use_threads,
         boto3_session=boto3_session,
         s3_additional_kwargs=s3_additional_kwargs,
+        validate_schema=validate_schema,
     )
     if chunked is False:
-        return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)
+        return _read_parquet(
+            data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema
+        )
     return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads)

@@ -1317,14 +1331,16 @@ def _read_parquet(
     columns: Optional[List[str]] = None,
     categories: List[str] = None,
     use_threads: bool = True,
+    validate_schema: bool = True,
 ) -> pd.DataFrame:
     tables: List[pa.Table] = []
     for piece in data.pieces:
         table: pa.Table = piece.read(
             columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
         )
         tables.append(table)
-    table = pa.lib.concat_tables(tables)
+    promote: bool = not validate_schema
+    table = pa.lib.concat_tables(tables, promote=promote)
     return table.to_pandas(
         use_threads=use_threads,
         split_blocks=True,
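
The _read_parquet change above leans on pyarrow schema promotion: validate_schema=True keeps the strict ParquetDataset check (mismatched file schemas raise ValueError), while validate_schema=False skips that check and instead passes promote=True to concat_tables, which unifies the schemas and null-fills missing columns. A minimal local sketch of that concat behaviour (no S3 involved), assuming a pyarrow version where concat_tables still accepts the promote keyword:

import pyarrow as pa

# Two tables with different schemas, mimicking two Parquet files under one prefix.
t1 = pa.table({"id": [1, 2, 3]})
t2 = pa.table({"id2": [1, 2, 3], "val": ["foo", "boo", "bar"]})

# promote=False (the validate_schema=True path) would raise because the schemas differ;
# promote=True (the validate_schema=False path) unifies the schemas and fills nulls.
merged = pa.concat_tables([t1, t2], promote=True)
print(merged.num_rows, merged.num_columns)  # 6 rows, 3 columns, matching the test below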

testing/test_awswrangler/test_data_lake.py

Lines changed: 27 additions & 0 deletions
@@ -664,3 +664,30 @@ def test_category(bucket, database):
     ensure_data_types_category(df2)
     wr.s3.delete_objects(path=paths)
     assert wr.catalog.delete_table_if_exists(database=database, table="test_category") is True
+
+
+def test_parquet_validate_schema(bucket, database):
+    path = f"s3://{bucket}/test_parquet_file_validate/"
+    wr.s3.delete_objects(path=path)
+
+    df = pd.DataFrame({"id": [1, 2, 3]})
+    path_file = f"s3://{bucket}/test_parquet_file_validate/0.parquet"
+    wr.s3.to_parquet(df=df, path=path_file)
+    wr.s3.wait_objects_exist(paths=[path_file])
+
+    df2 = pd.DataFrame({"id2": [1, 2, 3], "val": ["foo", "boo", "bar"]})
+    path_file2 = f"s3://{bucket}/test_parquet_file_validate/1.parquet"
+    wr.s3.to_parquet(df=df2, path=path_file2)
+    wr.s3.wait_objects_exist(paths=[path_file2])
+
+    df3 = wr.s3.read_parquet(path=path, validate_schema=False)
+    assert len(df3.index) == 6
+    assert len(df3.columns) == 3
+
+    with pytest.raises(ValueError):
+        wr.s3.read_parquet(path=path, validate_schema=True)
+
+    with pytest.raises(ValueError):
+        wr.s3.store_parquet_metadata(path=path, database=database, table="test_parquet_validate_schema", dataset=True)
+
+    wr.s3.delete_objects(path=path)
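
A hedged usage sketch of the new parameter (the bucket name and prefix below are placeholders; the prefix is assumed to contain Parquet files with differing schemas, as in the test above):

import awswrangler as wr

path = "s3://my-bucket/mixed-schema-prefix/"

# Default: validate_schema=True keeps the strict check and raises ValueError
# when the file schemas under the prefix do not match.
# wr.s3.read_parquet(path=path)

# Opting out reads every file and null-fills columns missing from individual files.
df = wr.s3.read_parquet(path=path, validate_schema=False)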
