
Commit 2e2b000

Add Spark.flatten() and mypy initial development
1 parent 49b7880 commit 2e2b000

4 files changed: +158 −34 lines changed

awswrangler/data_types.py

Lines changed: 32 additions & 30 deletions

@@ -1,14 +1,16 @@
+from typing import List, Tuple, Dict, Callable
import logging
from datetime import datetime, date

-import pyarrow
+import pyarrow as pa  # type: ignore
+import pandas as pd  # type: ignore

-from awswrangler.exceptions import UnsupportedType, UndetectedType
+from awswrangler.exceptions import UnsupportedType, UndetectedType  # type: ignore

logger = logging.getLogger(__name__)


-def athena2pandas(dtype):
+def athena2pandas(dtype: str) -> str:
    dtype = dtype.lower()
    if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]:
        return "Int64"
@@ -28,7 +30,7 @@ def athena2pandas(dtype):
        raise UnsupportedType(f"Unsupported Athena type: {dtype}")


-def athena2pyarrow(dtype):
+def athena2pyarrow(dtype: str) -> str:
    dtype = dtype.lower()
    if dtype == "tinyint":
        return "int8"
@@ -54,7 +56,7 @@ def athena2pyarrow(dtype):
        raise UnsupportedType(f"Unsupported Athena type: {dtype}")


-def athena2python(dtype):
+def athena2python(dtype: str) -> type:
    dtype = dtype.lower()
    if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]:
        return int
@@ -72,7 +74,7 @@ def athena2python(dtype):
        raise UnsupportedType(f"Unsupported Athena type: {dtype}")


-def athena2redshift(dtype):
+def athena2redshift(dtype: str) -> str:
    dtype = dtype.lower()
    if dtype == "smallint":
        return "SMALLINT"
@@ -96,7 +98,7 @@ def athena2redshift(dtype):
        raise UnsupportedType(f"Unsupported Athena type: {dtype}")


-def pandas2athena(dtype):
+def pandas2athena(dtype: str) -> str:
    dtype = dtype.lower()
    if dtype == "int32":
        return "int"
@@ -116,7 +118,7 @@ def pandas2athena(dtype):
        raise UnsupportedType(f"Unsupported Pandas type: {dtype}")


-def pandas2redshift(dtype):
+def pandas2redshift(dtype: str) -> str:
    dtype = dtype.lower()
    if dtype == "int32":
        return "INTEGER"
@@ -136,7 +138,7 @@ def pandas2redshift(dtype):
        raise UnsupportedType("Unsupported Pandas type: " + dtype)


-def pyarrow2athena(dtype):
+def pyarrow2athena(dtype: pa.types) -> str:
    dtype_str = str(dtype).lower()
    if dtype_str == "int8":
        return "tinyint"
@@ -167,7 +169,7 @@ def pyarrow2athena(dtype):
        raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")


-def pyarrow2redshift(dtype):
+def pyarrow2redshift(dtype: pa.types) -> str:
    dtype_str = str(dtype).lower()
    if dtype_str == "int16":
        return "SMALLINT"
@@ -191,25 +193,25 @@ def pyarrow2redshift(dtype):
        raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")


-def python2athena(python_type):
-    python_type = str(python_type)
-    if python_type == "<class 'int'>":
+def python2athena(python_type: type) -> str:
+    python_type_str: str = str(python_type)
+    if python_type_str == "<class 'int'>":
        return "bigint"
-    elif python_type == "<class 'float'>":
+    elif python_type_str == "<class 'float'>":
        return "double"
-    elif python_type == "<class 'boll'>":
+    elif python_type_str == "<class 'boll'>":
        return "boolean"
-    elif python_type == "<class 'str'>":
+    elif python_type_str == "<class 'str'>":
        return "string"
-    elif python_type == "<class 'datetime.datetime'>":
+    elif python_type_str == "<class 'datetime.datetime'>":
        return "timestamp"
-    elif python_type == "<class 'datetime.date'>":
+    elif python_type_str == "<class 'datetime.date'>":
        return "date"
    else:
-        raise UnsupportedType(f"Unsupported Python type: {python_type}")
+        raise UnsupportedType(f"Unsupported Python type: {python_type_str}")


-def redshift2athena(dtype):
+def redshift2athena(dtype: str) -> str:
    dtype_str = str(dtype)
    if dtype_str in ["SMALLINT", "INT2"]:
        return "smallint"
@@ -233,8 +235,8 @@ def redshift2athena(dtype):
        raise UnsupportedType(f"Unsupported Redshift type: {dtype_str}")


-def redshift2pyarrow(dtype):
-    dtype_str = str(dtype)
+def redshift2pyarrow(dtype: str) -> str:
+    dtype_str: str = str(dtype)
    if dtype_str in ["SMALLINT", "INT2"]:
        return "int16"
    elif dtype_str in ["INTEGER", "INT", "INT4"]:
@@ -257,7 +259,7 @@ def redshift2pyarrow(dtype):
        raise UnsupportedType(f"Unsupported Redshift type: {dtype_str}")


-def spark2redshift(dtype):
+def spark2redshift(dtype: str) -> str:
    dtype = dtype.lower()
    if dtype == "smallint":
        return "SMALLINT"
@@ -281,7 +283,7 @@ def spark2redshift(dtype):
        raise UnsupportedType("Unsupported Spark type: " + dtype)


-def convert_schema(func, schema):
+def convert_schema(func: Callable, schema: List[Tuple[str, str]]) -> Dict[str, str]:
    """
    Convert schema in the format of {"col name": "bigint", "col2 name": "int"}
    applying some data types conversion function (e.g. spark2redshift)
@@ -293,16 +295,16 @@ def convert_schema(func, schema):
    return {name: func(dtype) for name, dtype in schema}


-def extract_pyarrow_schema_from_pandas(dataframe,
-                                       preserve_index,
-                                       indexes_position="right"):
+def extract_pyarrow_schema_from_pandas(dataframe: pd.DataFrame,
+                                       preserve_index: bool,
+                                       indexes_position: str = "right") -> List[Tuple[str, str]]:
    """
    Extract the related Pyarrow schema from any Pandas DataFrame

    :param dataframe: Pandas Dataframe
    :param preserve_index: True or False
    :param indexes_position: "right" or "left"
-    :return: Pyarrow schema (e.g. {"col name": "bigint", "col2 name": "int"})
+    :return: Pyarrow schema (e.g. [("col name": "bigint"), ("col2 name": "int")]
    """
    cols = []
    cols_dtypes = {}
@@ -319,8 +321,8 @@ def extract_pyarrow_schema_from_pandas(dataframe,

    # Filling cols_dtypes and indexes
    indexes = []
-    for field in pyarrow.Schema.from_pandas(df=dataframe[cols],
-                                            preserve_index=preserve_index):
+    for field in pa.Schema.from_pandas(df=dataframe[cols],
+                                       preserve_index=preserve_index):
        name = str(field.name)
        dtype = field.type
        cols_dtypes[name] = dtype
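
The annotated helpers above compose through convert_schema(), which applies a conversion function to every (column name, type) pair and returns a plain dict. A minimal sketch of assumed usage, importing the functions from awswrangler.data_types (the sample column name is hypothetical):

    # Minimal sketch of assumed usage; the sample schema is hypothetical.
    from awswrangler.data_types import athena2pandas, convert_schema, spark2redshift

    # convert_schema() maps a conversion function over (column, type) tuples.
    spark_schema = [("customer_id", "smallint")]
    print(convert_schema(func=spark2redshift, schema=spark_schema))  # {'customer_id': 'SMALLINT'}

    # With the new annotations (e.g. athena2pandas(dtype: str) -> str),
    # mypy can flag a call like athena2pandas(123) before runtime.
    print(athena2pandas("bigint"))  # "Int64"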

awswrangler/s3.py

Lines changed: 3 additions & 3 deletions

@@ -2,9 +2,9 @@
from math import ceil
import logging

-from botocore.exceptions import ClientError
-import s3fs
-import tenacity
+from botocore.exceptions import ClientError  # type: ignore
+import s3fs  # type: ignore
+import tenacity  # type: ignore

from awswrangler.utils import calculate_bounders, wait_process_release

awswrangler/spark.py

Lines changed: 121 additions & 0 deletions

@@ -1,9 +1,11 @@
+from typing import List, Tuple, Dict
import logging

import pandas as pd

from pyspark.sql.functions import pandas_udf, PandasUDFType, spark_partition_id
from pyspark.sql.types import TimestampType
+from pyspark.sql import DataFrame

from awswrangler.exceptions import MissingBatchDetected, UnsupportedFileFormat

@@ -210,3 +212,122 @@ def create_glue_table(self,
            extra_args=extra_args)
        if load_partitions:
            self._session.athena.repair_table(database=database, table=table)
+
+    @staticmethod
+    def _is_struct(dtype: str) -> bool:
+        return True if dtype.startswith("struct") else False
+
+    @staticmethod
+    def _is_array(dtype: str) -> bool:
+        return True if dtype.startswith("array") else False
+
+    @staticmethod
+    def _is_map(dtype: str) -> bool:
+        return True if dtype.startswith("map") else False
+
+    @staticmethod
+    def _is_array_or_map(dtype: str) -> bool:
+        return True if (dtype.startswith("array") or dtype.startswith("map")) else False
+
+    @staticmethod
+    def _parse_aux(path: str, aux: str) -> Tuple[str, str]:
+        path_child: str
+        dtype: str
+        if ":" in aux:
+            path_child, dtype = aux.split(sep=":", maxsplit=1)
+        else:
+            path_child = "element"
+            dtype = aux
+        return f"{path}.{path_child}", dtype
+
+    @staticmethod
+    def _flatten_struct_column(path: str, dtype: str) -> List[Tuple[str, str]]:
+        dtype: str = dtype[7:-1]  # Cutting off "struct<" and ">"
+        cols: List[Tuple[str, str]] = []
+        struct_acc: int = 0
+        path_child: str
+        dtype_child: str
+        aux: str = ""
+        for c, i in zip(dtype, range(len(dtype), 0, -1)):  # Zipping a descendant ID for each letter
+            if ((c == ",") and (struct_acc == 0)) or (i == 1):
+                if i == 1:
+                    aux += c
+                path_child, dtype_child = Spark._parse_aux(path=path, aux=aux)
+                if Spark._is_struct(dtype=dtype_child):
+                    cols += Spark._flatten_struct_column(path=path_child, dtype=dtype_child)  # Recursion
+                elif Spark._is_array(dtype=dtype):
+                    cols.append((path, "array"))
+                else:
+                    cols.append((path_child, dtype_child))
+                aux = ""
+            elif c == "<":
+                aux += c
+                struct_acc += 1
+            elif c == ">":
+                aux += c
+                struct_acc -= 1
+            else:
+                aux += c
+        return cols
+
+    @staticmethod
+    def _flatten_struct_dataframe(
+            df: DataFrame,
+            explode_outer: bool = True,
+            explode_pos: bool = True) -> List[Tuple[str, str, str]]:
+        explode: str = "EXPLODE_OUTER" if explode_outer is True else "EXPLODE"
+        explode = f"POS{explode}" if explode_pos is True else explode
+        cols: List[Tuple[str, str]] = []
+        for path, dtype in df.dtypes:
+            if Spark._is_struct(dtype=dtype):
+                cols += Spark._flatten_struct_column(path=path, dtype=dtype)
+            elif Spark._is_array(dtype=dtype):
+                cols.append((path, "array"))
+            elif Spark._is_map(dtype=dtype):
+                cols.append((path, "map"))
+            else:
+                cols.append((path, dtype))
+        cols_exprs: List[Tuple[str, str, str]] = []
+        expr: str
+        for path, dtype in cols:
+            path_under = path.replace('.', '_')
+            if Spark._is_array(dtype):
+                if explode_pos:
+                    expr = f"{explode}({path}) AS ({path_under}_pos, {path_under})"
+                else:
+                    expr = f"{explode}({path}) AS {path_under}"
+            elif Spark._is_map(dtype):
+                if explode_pos:
+                    expr = f"{explode}({path}) AS ({path_under}_pos, {path_under}_key, {path_under}_value)"
+                else:
+                    expr = f"{explode}({path}) AS ({path_under}_key, {path_under}_value)"
+            else:
+                expr = f"{path} AS {path.replace('.', '_')}"
+            cols_exprs.append((path, dtype, expr))
+        return cols_exprs
+
+    @staticmethod
+    def _build_name(name: str, expr: str) -> str:
+        suffix: str = expr[expr.find("(") + 1: expr.find(")")]
+        return f"{name}_{suffix}".replace(".", "_")
+
+    @staticmethod
+    def flatten(
+            df: DataFrame,
+            explode_outer: bool = True,
+            explode_pos: bool = True,
+            name: str = "root") -> Dict[str, DataFrame]:
+        cols_exprs: List[Tuple[str, str, str]] = Spark._flatten_struct_dataframe(
+            df=df,
+            explode_outer=explode_outer,
+            explode_pos=explode_pos)
+        exprs_arr: List[str] = [x[2] for x in cols_exprs if Spark._is_array_or_map(x[1])]
+        exprs: List[str] = [x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1])]
+        dfs: Dict[str, DataFrame] = {name: df.selectExpr(exprs)}
+        exprs: List[str] = [x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1]) and not x[0].endswith("_pos")]
+        for expr in exprs_arr:
+            df_arr = df.selectExpr(exprs + [expr])
+            name_new: str = Spark._build_name(name=name, expr=expr)
+            dfs_new = Spark.flatten(df=df_arr, explode_outer=explode_outer, explode_pos=explode_pos, name=name_new)
+            dfs = {**dfs, **dfs_new}
+        return dfs
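
The new Spark.flatten() entry point recursively unnests a DataFrame: struct fields are promoted to top-level columns, and each array/map column is exploded (POSEXPLODE_OUTER by default) into its own child DataFrame keyed by a generated name. A minimal sketch of how it might be called, assuming Spark is importable from awswrangler.spark and a SparkSession is available (the nested sample data and names are hypothetical):

    # Minimal sketch of assumed usage; the sample data and names are hypothetical.
    from pyspark.sql import Row, SparkSession
    from awswrangler.spark import Spark

    spark = SparkSession.builder.appName("flatten-sketch").getOrCreate()

    # One struct column plus one array column.
    df = spark.createDataFrame(
        [(1, Row(city="Bonn", zip="53111"), [10, 20])],
        "order_id int, address struct<city:string,zip:string>, items array<int>")

    # Returns a dict of DataFrames: "root" holds order_id plus the flattened
    # struct fields (address_city, address_zip), while "root_items" holds the
    # exploded array with its position column (items_pos, items).
    dfs = Spark.flatten(df=df, explode_outer=True, explode_pos=True, name="root")
    for key, flat_df in dfs.items():
        print(key)
        flat_df.printSchema()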

requirements-dev.txt

Lines changed: 2 additions & 1 deletion

@@ -6,4 +6,5 @@ twine~=1.13.0
pyspark~=2.4.3
wheel~=0.33.6
sphinx~=2.1.2
-pyspark-stubs~=2.4.0
+pyspark-stubs~=2.4.0
+mypy~=0.730
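
With mypy pinned as a development dependency, the fresh annotations in data_types.py and spark.py can be checked locally. A small sketch using mypy's Python API; the target path and flag are assumptions, not the project's actual configuration:

    # Minimal sketch: invoke mypy programmatically (path and flag are assumed).
    from mypy import api

    stdout, stderr, exit_status = api.run(["--ignore-missing-imports", "awswrangler/"])
    print(stdout)
    print(f"mypy exited with status {exit_status}")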
