
Commit 2fb23f6

Remove unnecessary schema inference. #524

1 parent 4e255d4 · commit 2fb23f6

File tree

2 files changed (+39, -12 lines changed)


awswrangler/s3/_read_parquet.py

Lines changed: 13 additions & 12 deletions
@@ -301,18 +301,19 @@ def _read_parquet_chunked(
             )
             if pq_file is None:
                 continue
-            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
-                schema=pq_file.schema.to_arrow_schema(), partitions=None
-            )[0]
-            if validate_schema is True and last_schema is not None:
-                if schema != last_schema:
-                    raise exceptions.InvalidSchemaConvergence(
-                        f"Was detect at least 2 different schemas:\n"
-                        f"    - {last_path} -> {last_schema}\n"
-                        f"    - {path} -> {schema}"
-                    )
-            last_schema = schema
-            last_path = path
+            if validate_schema is True:
+                schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
+                    schema=pq_file.schema.to_arrow_schema(), partitions=None
+                )[0]
+                if last_schema is not None:
+                    if schema != last_schema:
+                        raise exceptions.InvalidSchemaConvergence(
+                            f"Was detect at least 2 different schemas:\n"
+                            f"    - {last_path} -> {last_schema}\n"
+                            f"    - {path} -> {schema}"
+                        )
+                last_schema = schema
+                last_path = path
             num_row_groups: int = pq_file.num_row_groups
             _logger.debug("num_row_groups: %s", num_row_groups)
             for i in range(num_row_groups):
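
In effect, chunked reads now run the Athena type inference only when the caller opts in with validate_schema=True; with the default of False, files stream through without the per-file schema comparison. A minimal usage sketch, assuming a hypothetical S3 prefix and using only the read_parquet parameters exercised by the tests below:

import awswrangler as wr

# Default (validate_schema=False): no per-file Athena type inference,
# so chunks stream even from files with un-inferable columns.
for chunk in wr.s3.read_parquet("s3://bucket/prefix/", dataset=True, chunked=True):
    print(chunk.shape)

# Opting in restores the per-file check: mismatched files raise
# InvalidSchemaConvergence, and columns with no inferable Athena type
# (e.g. all-NULL) raise UndetectedType.
for chunk in wr.s3.read_parquet(
    "s3://bucket/prefix/", dataset=True, chunked=True, validate_schema=True
):
    print(chunk.shape)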

tests/test_s3_parquet.py

Lines changed: 26 additions & 0 deletions
@@ -428,3 +428,29 @@ def test_empty_file(path, use_threads):
     df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
     df2["par"] = df2["par"].astype("string")
     assert df.equals(df2)
+
+
+def test_read_chunked(path):
+    path = f"{path}file.parquet"
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [None, None, None]})
+    wr.s3.to_parquet(df, path)
+    df2 = next(wr.s3.read_parquet(path, chunked=True))
+    assert df.shape == df2.shape
+
+
+def test_read_chunked_validation_exception(path):
+    path = f"{path}file.parquet"
+    df = pd.DataFrame({"c0": [0, 1, 2], "c1": [None, None, None]})
+    wr.s3.to_parquet(df, path)
+    with pytest.raises(wr.exceptions.UndetectedType):
+        next(wr.s3.read_parquet(path, chunked=True, validate_schema=True))
+
+
+def test_read_chunked_validation_exception2(path):
+    df = pd.DataFrame({"c0": [0, 1, 2]})
+    wr.s3.to_parquet(df, f"{path}file0.parquet")
+    df = pd.DataFrame({"c1": [0, 1, 2]})
+    wr.s3.to_parquet(df, f"{path}file1.parquet")
+    with pytest.raises(wr.exceptions.InvalidSchemaConvergence):
+        for _ in wr.s3.read_parquet(path, dataset=True, chunked=True, validate_schema=True):
+            pass
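
A note on why the all-None column in the validation tests above raises UndetectedType: pandas hands it to pyarrow with no type information, so pyarrow infers the null type, which has no Athena equivalent. A standalone sketch of that inference, assuming the local pandas-to-pyarrow conversion mirrors the schema the reader sees in the Parquet footer:

import pandas as pd
import pyarrow as pa

# An all-None column carries no type information, so pyarrow infers "null".
df = pd.DataFrame({"c0": [0, 1, 2], "c1": [None, None, None]})
schema = pa.Table.from_pandas(df).schema
print(schema.field("c0").type)  # int64 -> has an Athena mapping (bigint)
print(schema.field("c1").type)  # null  -> no Athena mapping, hence UndetectedType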
