Commit 7f0b4b6

Reducing I/O parallelism for some specific operations
1 parent 96075b9 commit 7f0b4b6

9 files changed: 146 additions and 75 deletions

README.md

Lines changed: 1 addition & 1 deletion

@@ -208,7 +208,7 @@ session.spark.create_glue_table(dataframe=dataframe,
 ```py3
 session = awswrangler.Session(spark_session=spark)
 dfs = session.spark.flatten(dataframe=df_nested)
-for name, df_flat in dfs:
+for name, df_flat in dfs.items():
     print(name)
     df_flat.show()
 ```

awswrangler/pandas.py

Lines changed: 13 additions & 13 deletions

@@ -885,12 +885,11 @@ def write_csv_dataframe(dataframe, path, preserve_index, compression, fs, extra_
         Pandas._write_csv_to_s3_retrying(fs=fs, path=path, buffer=csv_buffer)
 
     @staticmethod
-    @tenacity.retry(
-        retry=tenacity.retry_if_exception_type(exception_types=(ClientError, HTTPClientError)),
-        wait=tenacity.wait_random_exponential(multiplier=0.5, max=10),
-        stop=tenacity.stop_after_attempt(max_attempt_number=15),
-        reraise=True,
-    )
+    @tenacity.retry(retry=tenacity.retry_if_exception_type(exception_types=(ClientError, HTTPClientError)),
+                    wait=tenacity.wait_random_exponential(multiplier=0.5),
+                    stop=tenacity.stop_after_attempt(max_attempt_number=10),
+                    reraise=True,
+                    after=tenacity.after_log(logger, logging.INFO))
     def _write_csv_to_s3_retrying(fs: Any, path: str, buffer: bytes) -> None:
         with fs.open(path, "wb") as f:
             f.write(buffer)

@@ -931,12 +930,11 @@ def write_parquet_dataframe(dataframe, path, preserve_index, compression, fs, ca
             dataframe[col] = dataframe[col].astype("Int64")
 
     @staticmethod
-    @tenacity.retry(
-        retry=tenacity.retry_if_exception_type(exception_types=[ClientError, HTTPClientError]),
-        wait=tenacity.wait_random_exponential(multiplier=0.5, max=10),
-        stop=tenacity.stop_after_attempt(max_attempt_number=15),
-        reraise=True,
-    )
+    @tenacity.retry(retry=tenacity.retry_if_exception_type(exception_types=(ClientError, HTTPClientError)),
+                    wait=tenacity.wait_random_exponential(multiplier=0.5),
+                    stop=tenacity.stop_after_attempt(max_attempt_number=10),
+                    reraise=True,
+                    after=tenacity.after_log(logger, logging.INFO))
     def _write_parquet_to_s3_retrying(fs: Any, path: str, table: pa.Table, compression: str) -> None:
         with fs.open(path, "wb") as f:
             pq.write_table(table, f, compression=compression, coerce_timestamps="ms", flavor="spark")

@@ -1066,5 +1064,7 @@ def drop_duplicated_columns(dataframe: pd.DataFrame, inplace: bool = True) -> pd
         if inplace is False:
             dataframe = dataframe.copy(deep=True)
         duplicated_cols = dataframe.columns.duplicated()
-        logger.warning(f"Dropping repeated columns: {list(dataframe.columns[duplicated_cols])}")
+        duplicated_cols_names = list(dataframe.columns[duplicated_cols])
+        if len(duplicated_cols_names) > 0:
+            logger.warning(f"Dropping repeated columns: {duplicated_cols_names}")
         return dataframe.loc[:, ~duplicated_cols]
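
Note on the retry change above: the reshaped decorator drops the max=10 cap on the random-exponential wait (so the backoff may now grow past 10 seconds, up to tenacity's default ceiling), lowers the attempt limit from 15 to 10, and adds an after_log hook so every failed attempt is logged at INFO. Below is a minimal, self-contained sketch of that decorator shape using the public tenacity API; FlakyError, flaky_write, and the attempt counter are illustrative stand-ins and do not appear in awswrangler.

    import logging

    import tenacity

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("retry-sketch")


    class FlakyError(Exception):
        """Stand-in for botocore's ClientError / HTTPClientError."""


    _attempts = {"count": 0}


    @tenacity.retry(retry=tenacity.retry_if_exception_type(exception_types=(FlakyError,)),
                    wait=tenacity.wait_random_exponential(multiplier=0.5),
                    stop=tenacity.stop_after_attempt(max_attempt_number=10),
                    reraise=True,
                    after=tenacity.after_log(logger, logging.INFO))
    def flaky_write() -> str:
        """Fails twice, then succeeds, so the retry/backoff/logging path is exercised."""
        _attempts["count"] += 1
        if _attempts["count"] < 3:
            raise FlakyError("transient failure")
        return "ok"


    print(flaky_write())  # each failed attempt is reported through after_log at INFO level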

awswrangler/redshift.py

Lines changed: 18 additions & 6 deletions

@@ -1,3 +1,4 @@
+from typing import Dict, List, Union, Optional
 import json
 import logging
 

@@ -116,14 +117,25 @@ def get_connection(self, glue_connection):
         conn = self.generate_connection(database=database, host=host, port=int(port), user=user, password=password)
         return conn
 
-    def write_load_manifest(self, manifest_path, objects_paths):
-        objects_sizes = self._session.s3.get_objects_sizes(objects_paths=objects_paths)
-        manifest = {"entries": []}
+    def write_load_manifest(self, manifest_path: str, objects_paths: List[str], procs_io_bound: Optional[int] = None
+                            ) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
+        objects_sizes: Dict[str, int] = self._session.s3.get_objects_sizes(objects_paths=objects_paths,
+                                                                           procs_io_bound=procs_io_bound)
+        manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
+        path: str
+        size: int
         for path, size in objects_sizes.items():
-            entry = {"url": path, "mandatory": True, "meta": {"content_length": size}}
-            manifest.get("entries").append(entry)
-        payload = json.dumps(manifest)
+            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
+                "url": path,
+                "mandatory": True,
+                "meta": {
+                    "content_length": size
+                }
+            }
+            manifest["entries"].append(entry)
+        payload: str = json.dumps(manifest)
         client_s3 = self._session.boto3_session.client(service_name="s3", config=self._session.botocore_config)
+        bucket: str
         bucket, path = manifest_path.replace("s3://", "").split("/", 1)
         client_s3.put_object(Body=payload, Bucket=bucket, Key=path)
         return manifest
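
write_load_manifest now threads procs_io_bound down to get_objects_sizes and builds the Amazon Redshift COPY manifest entry by entry. As a rough illustration of the JSON this method uploads, here is a standalone sketch; the bucket name, object paths, and sizes are made up, and objects_sizes merely stands in for the result of get_objects_sizes().

    import json
    from typing import Dict, List, Union

    # Hypothetical sizes, standing in for what s3.get_objects_sizes() would return.
    objects_sizes: Dict[str, int] = {
        "s3://example-bucket/redshift-load/part-0.parquet": 1048576,
        "s3://example-bucket/redshift-load/part-1.parquet": 2097152,
    }

    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
    for path, size in objects_sizes.items():
        manifest["entries"].append({
            "url": path,                       # object to be loaded by the COPY command
            "mandatory": True,                 # COPY fails if this object is missing
            "meta": {"content_length": size},  # size hint used by Redshift
        })

    print(json.dumps(manifest, indent=4))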

awswrangler/s3.py

Lines changed: 8 additions & 8 deletions

@@ -1,3 +1,4 @@
+from typing import Dict, List, Optional
 import multiprocessing as mp
 from math import ceil
 import logging

@@ -202,12 +203,11 @@ def list_objects(self, path):
         return keys
 
     @staticmethod
-    @tenacity.retry(
-        retry=tenacity.retry_if_exception_type(exception_types=(ClientError, HTTPClientError)),
-        wait=tenacity.wait_random_exponential(multiplier=0.5, max=10),
-        stop=tenacity.stop_after_attempt(max_attempt_number=15),
-        reraise=True,
-    )
+    @tenacity.retry(retry=tenacity.retry_if_exception_type(exception_types=(ClientError, HTTPClientError)),
+                    wait=tenacity.wait_random_exponential(multiplier=0.5),
+                    stop=tenacity.stop_after_attempt(max_attempt_number=10),
+                    reraise=True,
+                    after=tenacity.after_log(logger, logging.INFO))
     def head_object_with_retry(client, bucket, key):
         return client.head_object(Bucket=bucket, Key=key)
 

@@ -226,11 +226,11 @@ def _get_objects_head_remote(send_pipe, session_primitives, objects_paths):
         send_pipe.send(objects_sizes)
         send_pipe.close()
 
-    def get_objects_sizes(self, objects_paths, procs_io_bound=None):
+    def get_objects_sizes(self, objects_paths: List[str], procs_io_bound: Optional[int] = None) -> Dict[str, int]:
         if not procs_io_bound:
             procs_io_bound = self._session.procs_io_bound
         logger.debug(f"procs_io_bound: {procs_io_bound}")
-        objects_sizes = {}
+        objects_sizes: Dict[str, int] = {}
         procs = []
         receive_pipes = []
         bounders = calculate_bounders(len(objects_paths), procs_io_bound)
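
get_objects_sizes now accepts an optional procs_io_bound override, so callers such as the Spark writer can reduce how many processes issue HEAD requests in parallel; internally the object paths are split across that many workers via calculate_bounders. The sketch below only illustrates that splitting idea with a simplified, hypothetical split_paths helper; it is not awswrangler's calculate_bounders implementation.

    from typing import List


    def split_paths(paths: List[str], procs_io_bound: int) -> List[List[str]]:
        """Simplified stand-in for the bounders logic: spread the paths as evenly
        as possible across at most procs_io_bound worker processes."""
        num_groups = max(1, min(procs_io_bound, len(paths)))
        base, rest = divmod(len(paths), num_groups)
        chunks: List[List[str]] = []
        start = 0
        for i in range(num_groups):
            size = base + (1 if i < rest else 0)
            chunks.append(paths[start:start + size])
            start += size
        return chunks


    paths = [f"s3://example-bucket/key-{i}" for i in range(10)]
    print(split_paths(paths, procs_io_bound=4))  # 4 chunks of sizes 3, 3, 2, 2
    print(split_paths(paths, procs_io_bound=1))  # a single chunk: no parallel HEAD requests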

awswrangler/session.py

Lines changed: 0 additions & 2 deletions

@@ -107,9 +107,7 @@ def _load_new_boto3_session(self):
         if self.aws_access_key_id and self.aws_secret_access_key:
             args["aws_access_key_id"] = self.aws_access_key_id
             args["aws_secret_access_key"] = self.aws_secret_access_key
-
         self._boto3_session = boto3.Session(**args)
-
         self._profile_name = self._boto3_session.profile_name
         self._aws_access_key_id = self._boto3_session.get_credentials().access_key
         self._aws_secret_access_key = self._boto3_session.get_credentials().secret_key

awswrangler/spark.py

Lines changed: 52 additions & 38 deletions

@@ -1,12 +1,12 @@
-from typing import List, Tuple, Dict
+from typing import List, Tuple, Dict, Any
 import logging
 import os
 
 import pandas as pd  # type: ignore
 
 from pyspark.sql.functions import pandas_udf, PandasUDFType, spark_partition_id
 from pyspark.sql.types import TimestampType
-from pyspark.sql import DataFrame
+from pyspark.sql import DataFrame, SparkSession
 
 from awswrangler.exceptions import MissingBatchDetected, UnsupportedFileFormat
 

@@ -18,14 +18,22 @@
 class Spark:
     def __init__(self, session):
         self._session = session
+        cpus: int = os.cpu_count()
+        if cpus == 1:
+            self._procs_io_bound: int = 1
+        else:
+            self._procs_io_bound = int(cpus / 2)
+        logging.info(f"_procs_io_bound: {self._procs_io_bound}")
 
-    def read_csv(self, **args):
-        spark = self._session.spark_session
+    def read_csv(self, **args) -> DataFrame:
+        spark: SparkSession = self._session.spark_session
         return spark.read.csv(**args)
 
     @staticmethod
-    def _extract_casts(dtypes):
-        casts = {}
+    def _extract_casts(dtypes: List[Tuple[str, str]]) -> Dict[str, str]:
+        casts: Dict[str, str] = {}
+        name: str
+        dtype: str
         for name, dtype in dtypes:
             if dtype in ["smallint", "int", "bigint"]:
                 casts[name] = "bigint"

@@ -35,7 +43,9 @@ def _extract_casts(dtypes):
         return casts
 
     @staticmethod
-    def date2timestamp(dataframe):
+    def date2timestamp(dataframe: DataFrame) -> DataFrame:
+        name: str
+        dtype: str
         for name, dtype in dataframe.dtypes:
             if dtype == "date":
                 dataframe = dataframe.withColumn(name, dataframe[name].cast(TimestampType()))

@@ -44,19 +54,19 @@
 
     def to_redshift(
         self,
-        dataframe,
-        path,
-        connection,
-        schema,
-        table,
-        iam_role,
-        diststyle="AUTO",
+        dataframe: DataFrame,
+        path: str,
+        connection: Any,
+        schema: str,
+        table: str,
+        iam_role: str,
+        diststyle: str = "AUTO",
         distkey=None,
-        sortstyle="COMPOUND",
+        sortstyle: str = "COMPOUND",
         sortkey=None,
-        min_num_partitions=200,
-        mode="append",
-    ):
+        min_num_partitions: int = 200,
+        mode: str = "append",
+    ) -> None:
         """
         Load Spark Dataframe as a Table on Amazon Redshift
 

@@ -78,54 +88,58 @@ def to_redshift(
         if path[-1] != "/":
             path += "/"
         self._session.s3.delete_objects(path=path)
-        spark = self._session.spark_session
-        casts = Spark._extract_casts(dataframe.dtypes)
+        spark: SparkSession = self._session.spark_session
+        casts: Dict[str, str] = Spark._extract_casts(dataframe.dtypes)
         dataframe = Spark.date2timestamp(dataframe)
         dataframe.cache()
-        num_rows = dataframe.count()
+        num_rows: int = dataframe.count()
         logger.info(f"Number of rows: {num_rows}")
+        num_partitions: int
         if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
             num_partitions = 1
         else:
-            num_slices = self._session.redshift.get_number_of_slices(redshift_conn=connection)
+            num_slices: int = self._session.redshift.get_number_of_slices(redshift_conn=connection)
             logger.debug(f"Number of slices on Redshift: {num_slices}")
            num_partitions = num_slices
             while num_partitions < min_num_partitions:
                 num_partitions += num_slices
         logger.debug(f"Number of partitions calculated: {num_partitions}")
         spark.conf.set("spark.sql.execution.arrow.enabled", "true")
         session_primitives = self._session.primitives
+        par_col_name: str = "aws_data_wrangler_internal_partition_id"
 
         @pandas_udf(returnType="objects_paths string", functionType=PandasUDFType.GROUPED_MAP)
-        def write(pandas_dataframe):
+        def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
             # Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
             # a temporary workaround while waiting for Apache Arrow updates
             # https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
             os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
 
-            del pandas_dataframe["aws_data_wrangler_internal_partition_id"]
-            paths = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
-                                                                 path=path,
-                                                                 preserve_index=False,
-                                                                 mode="append",
-                                                                 procs_cpu_bound=1,
-                                                                 cast_columns=casts)
+            del pandas_dataframe[par_col_name]
+            paths: List[str] = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
+                                                                            path=path,
+                                                                            preserve_index=False,
+                                                                            mode="append",
+                                                                            procs_cpu_bound=1,
+                                                                            procs_io_bound=1,
+                                                                            cast_columns=casts)
             return pd.DataFrame.from_dict({"objects_paths": paths})
 
-        df_objects_paths = dataframe.repartition(numPartitions=num_partitions) \
-            .withColumn("aws_data_wrangler_internal_partition_id", spark_partition_id()) \
-            .groupby("aws_data_wrangler_internal_partition_id") \
-            .apply(write)
+        df_objects_paths = dataframe.repartition(numPartitions=num_partitions)  # type: ignore
+        df_objects_paths = df_objects_paths.withColumn(par_col_name, spark_partition_id())  # type: ignore
+        df_objects_paths = df_objects_paths.groupby(par_col_name).apply(write)  # type: ignore
 
-        objects_paths = list(df_objects_paths.toPandas()["objects_paths"])
+        objects_paths: List[str] = list(df_objects_paths.toPandas()["objects_paths"])
         dataframe.unpersist()
-        num_files_returned = len(objects_paths)
+        num_files_returned: int = len(objects_paths)
         if num_files_returned != num_partitions:
             raise MissingBatchDetected(f"{num_files_returned} files returned. {num_partitions} expected.")
         logger.debug(f"List of objects returned: {objects_paths}")
         logger.debug(f"Number of objects returned from UDF: {num_files_returned}")
-        manifest_path = f"{path}manifest.json"
-        self._session.redshift.write_load_manifest(manifest_path=manifest_path, objects_paths=objects_paths)
+        manifest_path: str = f"{path}manifest.json"
+        self._session.redshift.write_load_manifest(manifest_path=manifest_path,
+                                                   objects_paths=objects_paths,
+                                                   procs_io_bound=self._procs_io_bound)
         self._session.redshift.load_table(dataframe=dataframe,
                                           dataframe_type="spark",
                                           manifest_path=manifest_path,
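
This file carries the core of the commit: inside the pandas UDF each Spark task now writes with procs_cpu_bound=1 and procs_io_bound=1 (Spark itself already supplies the parallelism there), and the manifest's HEAD requests are capped at roughly half the driver's cores via the new _procs_io_bound attribute. Below is a small restatement of that heuristic; derive_procs_io_bound is just an illustrative name, not part of the library.

    import os
    from typing import Optional


    def derive_procs_io_bound(cpus: Optional[int] = None) -> int:
        """Restates the Spark.__init__ heuristic: use half of the available cores
        for I/O-bound work on the driver, but never fewer than one process."""
        cpus = os.cpu_count() if cpus is None else cpus
        if not cpus or cpus == 1:
            return 1
        return int(cpus / 2)


    print(derive_procs_io_bound(1))   # 1 -> 1
    print(derive_procs_io_bound(8))   # 8 -> 4
    print(derive_procs_io_bound())    # half of this machine's logical cores

Halving the core count presumably leaves CPU headroom on the driver while object sizes are being gathered, which is consistent with the commit message about reducing I/O parallelism for specific operations.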

docs/source/examples.rst

Lines changed: 1 addition & 1 deletion

@@ -164,7 +164,7 @@ Flatten nested PySpark DataFrame
 
     session = awswrangler.Session(spark_session=spark)
     dfs = session.spark.flatten(dataframe=df_nested)
-    for name, df_flat in dfs:
+    for name, df_flat in dfs.items():
         print(name)
         df_flat.show()
 

testing/test_awswrangler/test_redshift.py

Lines changed: 30 additions & 0 deletions

@@ -280,6 +280,36 @@ def test_to_redshift_spark_big(session, bucket, redshift_parameters):
     assert len(list(dataframe.columns)) == len(list(rows[0]))
 
 
+def test_stress_to_redshift_spark_big(session, bucket, redshift_parameters):
+    dataframe = session.spark_session.createDataFrame(
+        pd.DataFrame({
+            "A": list(range(1_000_000)),
+            "B": list(range(1_000_000)),
+            "C": list(range(1_000_000))
+        }))
+
+    for i in range(10):
+        print(i)
+        con = Redshift.generate_connection(
+            database="test",
+            host=redshift_parameters.get("RedshiftAddress"),
+            port=redshift_parameters.get("RedshiftPort"),
+            user="test",
+            password=redshift_parameters.get("RedshiftPassword"),
+        )
+        session.spark.to_redshift(
+            dataframe=dataframe,
+            path=f"s3://{bucket}/redshift-load/",
+            connection=con,
+            schema="public",
+            table="test",
+            iam_role=redshift_parameters.get("RedshiftRole"),
+            mode="overwrite",
+            min_num_partitions=4,
+        )
+        con.close()
+
+
 @pytest.mark.parametrize(
     "sample_name,mode,factor,diststyle,distkey,exc,sortstyle,sortkey",
     [
