 from typing import List, Tuple, Dict
 import logging
+import os
 
 import pandas as pd  # type: ignore
 
-from pyspark import sql
+from pyspark.sql.functions import pandas_udf, PandasUDFType, spark_partition_id
+from pyspark.sql.types import TimestampType
+from pyspark.sql import DataFrame
 
 from awswrangler.exceptions import MissingBatchDetected, UnsupportedFileFormat
 
@@ -35,7 +38,7 @@ def _extract_casts(dtypes):
     def date2timestamp(dataframe):
         for name, dtype in dataframe.dtypes:
             if dtype == "date":
-                dataframe = dataframe.withColumn(name, dataframe[name].cast(sql.types.TimestampType()))
+                dataframe = dataframe.withColumn(name, dataframe[name].cast(TimestampType()))
                 logger.warning(f"Casting column {name} from date to timestamp!")
         return dataframe
 
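For context, a minimal sketch of what the rewritten cast does (the session and data below are invented for illustration, not part of the commit):

```python
import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType

# Hypothetical local session and single-column DataFrame with a DateType column.
spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(datetime.date(2019, 10, 1),)], ["day"])

# Same pattern as the hunk above: cast every DateType column to TimestampType.
df = df.withColumn("day", df["day"].cast(TimestampType()))
df.printSchema()  # day: timestamp (nullable = true)
```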
@@ -93,9 +96,13 @@ def to_redshift(
         spark.conf.set("spark.sql.execution.arrow.enabled", "true")
         session_primitives = self._session.primitives
 
-        @sql.functions.pandas_udf(returnType="objects_paths string",
-                                  functionType=sql.functions.PandasUDFType.GROUPED_MAP)
+        @pandas_udf(returnType="objects_paths string", functionType=PandasUDFType.GROUPED_MAP)
         def write(pandas_dataframe):
+            # Export the ARROW_PRE_0_15_IPC_FORMAT environment variable as a
+            # temporary workaround while waiting for Apache Arrow updates:
+            # https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
+            os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
+
             del pandas_dataframe["aws_data_wrangler_internal_partition_id"]
             paths = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
                                                                  path=path,
@@ -106,7 +113,7 @@ def write(pandas_dataframe):
             return pd.DataFrame.from_dict({"objects_paths": paths})
 
         df_objects_paths = dataframe.repartition(numPartitions=num_partitions) \
-            .withColumn("aws_data_wrangler_internal_partition_id", sql.functions.spark_partition_id()) \
+            .withColumn("aws_data_wrangler_internal_partition_id", spark_partition_id()) \
             .groupby("aws_data_wrangler_internal_partition_id") \
             .apply(write)
 
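A self-contained sketch of the same GROUPED_MAP pattern (the `df`, `process`, and column names below are illustrative, not from the commit): every row is tagged with its physical partition id, grouped on it, and each group arrives in the UDF as one pandas DataFrame. Note that the `ARROW_PRE_0_15_IPC_FORMAT` export has to happen *inside* the UDF body, because it must be visible to the executor's Python worker process, not just the driver.

```python
import os

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType, spark_partition_id

# Illustrative GROUPED_MAP UDF: receives one pandas DataFrame per group and
# must return a pandas DataFrame matching the declared schema.
@pandas_udf(returnType="result string", functionType=PandasUDFType.GROUPED_MAP)
def process(pandas_dataframe):
    # pyarrow >= 0.15 workaround: must run in the executor's Python worker.
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
    del pandas_dataframe["partition_id"]
    return pd.DataFrame.from_dict({"result": [f"rows={len(pandas_dataframe)}"]})

# `df` is assumed to be an existing Spark DataFrame.
results = df.repartition(numPartitions=4) \
    .withColumn("partition_id", spark_partition_id()) \
    .groupby("partition_id") \
    .apply(process)
```

Setting `spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT=1` in the Spark configuration should achieve the same effect, but exporting the variable inside the UDF keeps the workaround next to the code that needs it.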
@@ -255,7 +262,7 @@ def _flatten_struct_column(path: str, dtype: str) -> List[Tuple[str, str]]:
         return cols
 
     @staticmethod
-    def _flatten_struct_dataframe(df: sql.DataFrame, explode_outer: bool = True,
+    def _flatten_struct_dataframe(df: DataFrame, explode_outer: bool = True,
                                   explode_pos: bool = True) -> List[Tuple[str, str, str]]:
         explode: str = "EXPLODE_OUTER" if explode_outer is True else "EXPLODE"
         explode = f"POS{explode}" if explode_pos is True else explode
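The two flags select between Spark SQL's four generator functions. A small sketch of the difference, assuming a SparkSession named `spark` (the data is invented):

```python
df = spark.createDataFrame([(1, ["a", "b"]), (2, None)], ["id", "arr"])

# EXPLODE drops row 2 (NULL array); EXPLODE_OUTER keeps it with a NULL value.
df.selectExpr("id", "EXPLODE_OUTER(arr) AS value").show()

# The POS* variants additionally emit each element's position in the array.
df.selectExpr("id", "POSEXPLODE_OUTER(arr) AS (pos, value)").show()
```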
@@ -294,8 +301,8 @@ def _build_name(name: str, expr: str) -> str:
         return f"{name}_{suffix}".replace(".", "_")
 
     @staticmethod
-    def flatten(dataframe: sql.DataFrame, explode_outer: bool = True, explode_pos: bool = True,
-                name: str = "root") -> Dict[str, sql.DataFrame]:
+    def flatten(dataframe: DataFrame, explode_outer: bool = True, explode_pos: bool = True,
+                name: str = "root") -> Dict[str, DataFrame]:
         """
         Convert a complex nested DataFrame into one (or many) flat DataFrames.
         If a column is a struct, it is flattened directly.
@@ -311,7 +318,7 @@ def flatten(dataframe: sql.DataFrame, explode_outer: bool = True, explode_pos: b
                                                                                  explode_pos=explode_pos)
         exprs_arr: List[str] = [x[2] for x in cols_exprs if Spark._is_array_or_map(x[1])]
         exprs: List[str] = [x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1])]
-        dfs: Dict[str, sql.DataFrame] = {name: dataframe.selectExpr(exprs)}
+        dfs: Dict[str, DataFrame] = {name: dataframe.selectExpr(exprs)}
         exprs = [x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1]) and not x[0].endswith("_pos")]
         for expr in exprs_arr:
             df_arr = dataframe.selectExpr(exprs + [expr])
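Putting it together, a hedged usage sketch (the nested input is invented): `flatten` returns a dict containing the fully flattened root DataFrame plus one additional DataFrame per exploded array or map column.

```python
from awswrangler.spark import Spark  # flatten is a staticmethod on this class

# Invented nested input: one struct column and one array column.
df = spark.sql("""
    SELECT 1 AS id,
           NAMED_STRUCT('a', 10, 'b', 20) AS info,
           ARRAY('x', 'y') AS tags
""")

dfs = Spark.flatten(dataframe=df, explode_outer=True, explode_pos=True, name="root")
for table_name, flat_df in dfs.items():
    # Expect "root" (struct fields flattened in place) plus one DataFrame
    # for the exploded "tags" array (e.g. "root_tags").
    print(table_name)
    flat_df.show()
```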