 from typing import List, Tuple, Dict
 import logging
 
-import pandas as pd
+import pandas as pd  # type: ignore
 
-from pyspark.sql.functions import pandas_udf, PandasUDFType, spark_partition_id
-from pyspark.sql.types import TimestampType
-from pyspark.sql import DataFrame
+from pyspark import sql
 
 from awswrangler.exceptions import MissingBatchDetected, UnsupportedFileFormat
 
@@ -38,7 +36,7 @@ def date2timestamp(dataframe):
         for name, dtype in dataframe.dtypes:
             if dtype == "date":
                 dataframe = dataframe.withColumn(
-                    name, dataframe[name].cast(TimestampType()))
+                    name, dataframe[name].cast(sql.types.TimestampType()))
                 logger.warning(
                     f"Casting column {name} from date to timestamp!")
         return dataframe
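For reference, a minimal sketch of the cast performed above, using the `sql.types` namespace this change adopts (the sample DataFrame is illustrative only, not taken from this module):

import datetime

from pyspark import sql
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(datetime.date(2019, 1, 1),)], ["col0"])
print(df.dtypes)  # [('col0', 'date')]

# Cast the date column to timestamp, as date2timestamp() does for every
# column whose dtype reports as "date".
df = df.withColumn("col0", df["col0"].cast(sql.types.TimestampType()))
print(df.dtypes)  # [('col0', 'timestamp')]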
@@ -98,8 +96,9 @@ def to_redshift(
         spark.conf.set("spark.sql.execution.arrow.enabled", "true")
         session_primitives = self._session.primitives
 
-        @pandas_udf(returnType="objects_paths string",
-                    functionType=PandasUDFType.GROUPED_MAP)
+        @sql.functions.pandas_udf(
+            returnType="objects_paths string",
+            functionType=sql.functions.PandasUDFType.GROUPED_MAP)
         def write(pandas_dataframe):
             del pandas_dataframe["aws_data_wrangler_internal_partition_id"]
             paths = session_primitives.session.pandas.to_parquet(
@@ -112,7 +111,7 @@ def write(pandas_dataframe):
             return pd.DataFrame.from_dict({"objects_paths": paths})
 
         df_objects_paths = dataframe.repartition(numPartitions=num_partitions) \
-            .withColumn("aws_data_wrangler_internal_partition_id", spark_partition_id()) \
+            .withColumn("aws_data_wrangler_internal_partition_id", sql.functions.spark_partition_id()) \
             .groupby("aws_data_wrangler_internal_partition_id") \
             .apply(write)
 
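The decorated write() above follows PySpark's grouped-map pandas UDF pattern: repartition, tag each partition, group by the tag, and apply a pandas function per group. A minimal, self-contained sketch of that pattern (PySpark 2.x API; the session, schema, and column names are illustrative assumptions, and the same objects are reached here by direct import rather than through the sql.functions namespace used in the diff):

import pandas as pd  # type: ignore
from pyspark.sql import SparkSession
from pyspark.sql.functions import PandasUDFType, pandas_udf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (1, "b"), (2, "c")], ["part_id", "value"])


# Grouped-map pandas UDF: each group arrives as one pandas DataFrame, and the
# returned pandas DataFrame must match the declared return schema.
@pandas_udf(returnType="part_id long, n_rows long",
            functionType=PandasUDFType.GROUPED_MAP)
def summarize(pdf):
    return pd.DataFrame({"part_id": [int(pdf["part_id"].iloc[0])],
                         "n_rows": [len(pdf)]})


df.groupby("part_id").apply(summarize).show()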
@@ -227,7 +226,8 @@ def _is_map(dtype: str) -> bool:
 
     @staticmethod
     def _is_array_or_map(dtype: str) -> bool:
-        return True if (dtype.startswith("array") or dtype.startswith("map")) else False
+        return True if (dtype.startswith("array")
+                        or dtype.startswith("map")) else False
 
     @staticmethod
     def _parse_aux(path: str, aux: str) -> Tuple[str, str]:
@@ -242,19 +242,22 @@ def _parse_aux(path: str, aux: str) -> Tuple[str, str]:
 
     @staticmethod
     def _flatten_struct_column(path: str, dtype: str) -> List[Tuple[str, str]]:
-        dtype: str = dtype[7:-1]  # Cutting off "struct<" and ">"
+        dtype = dtype[7:-1]  # Cutting off "struct<" and ">"
         cols: List[Tuple[str, str]] = []
         struct_acc: int = 0
         path_child: str
         dtype_child: str
         aux: str = ""
-        for c, i in zip(dtype, range(len(dtype), 0, -1)):  # Zipping a descendant ID for each letter
+        for c, i in zip(dtype,
+                        range(len(dtype), 0,
+                              -1)):  # Zipping a descendant ID for each letter
             if ((c == ",") and (struct_acc == 0)) or (i == 1):
                 if i == 1:
                     aux += c
                 path_child, dtype_child = Spark._parse_aux(path=path, aux=aux)
                 if Spark._is_struct(dtype=dtype_child):
-                    cols += Spark._flatten_struct_column(path=path_child, dtype=dtype_child)  # Recursion
+                    cols += Spark._flatten_struct_column(
+                        path=path_child, dtype=dtype_child)  # Recursion
                 elif Spark._is_array(dtype=dtype):
                     cols.append((path, "array"))
                 else:
@@ -271,10 +274,10 @@ def _flatten_struct_column(path: str, dtype: str) -> List[Tuple[str, str]]:
         return cols
 
     @staticmethod
-    def _flatten_struct_dataframe(
-            df: DataFrame,
-            explode_outer: bool = True,
-            explode_pos: bool = True) -> List[Tuple[str, str, str]]:
+    def _flatten_struct_dataframe(df: sql.DataFrame,
+                                  explode_outer: bool = True,
+                                  explode_pos: bool = True
+                                  ) -> List[Tuple[str, str, str]]:
         explode: str = "EXPLODE_OUTER" if explode_outer is True else "EXPLODE"
         explode = f"POS{explode}" if explode_pos is True else explode
         cols: List[Tuple[str, str]] = []
@@ -308,26 +311,34 @@ def _flatten_struct_dataframe(
 
     @staticmethod
     def _build_name(name: str, expr: str) -> str:
-        suffix: str = expr[expr.find("(") + 1: expr.find(")")]
+        suffix: str = expr[expr.find("(") + 1:expr.find(")")]
         return f"{name}_{suffix}".replace(".", "_")
 
     @staticmethod
-    def flatten(
-            df: DataFrame,
-            explode_outer: bool = True,
-            explode_pos: bool = True,
-            name: str = "root") -> Dict[str, DataFrame]:
-        cols_exprs: List[Tuple[str, str, str]] = Spark._flatten_struct_dataframe(
-            df=df,
-            explode_outer=explode_outer,
-            explode_pos=explode_pos)
-        exprs_arr: List[str] = [x[2] for x in cols_exprs if Spark._is_array_or_map(x[1])]
-        exprs: List[str] = [x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1])]
-        dfs: Dict[str, DataFrame] = {name: df.selectExpr(exprs)}
-        exprs: List[str] = [x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1]) and not x[0].endswith("_pos")]
+    def flatten(df: sql.DataFrame,
+                explode_outer: bool = True,
+                explode_pos: bool = True,
+                name: str = "root") -> Dict[str, sql.DataFrame]:
+        cols_exprs: List[
+            Tuple[str, str, str]] = Spark._flatten_struct_dataframe(
+                df=df, explode_outer=explode_outer, explode_pos=explode_pos)
+        exprs_arr: List[str] = [
+            x[2] for x in cols_exprs if Spark._is_array_or_map(x[1])
+        ]
+        exprs: List[str] = [
+            x[2] for x in cols_exprs if not Spark._is_array_or_map(x[1])
+        ]
+        dfs: Dict[str, sql.DataFrame] = {name: df.selectExpr(exprs)}
+        exprs = [
+            x[2] for x in cols_exprs
+            if not Spark._is_array_or_map(x[1]) and not x[0].endswith("_pos")
+        ]
         for expr in exprs_arr:
             df_arr = df.selectExpr(exprs + [expr])
             name_new: str = Spark._build_name(name=name, expr=expr)
-            dfs_new = Spark.flatten(df=df_arr, explode_outer=explode_outer, explode_pos=explode_pos, name=name_new)
+            dfs_new = Spark.flatten(df=df_arr,
+                                    explode_outer=explode_outer,
+                                    explode_pos=explode_pos,
+                                    name=name_new)
             dfs = {**dfs, **dfs_new}
         return dfs
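Since flatten() is the public entry point reshaped here, a hedged usage sketch may help; the nested sample data and the awswrangler.spark import path are assumptions for illustration, not taken from this diff:

from pyspark.sql import Row, SparkSession

from awswrangler.spark import Spark  # import path assumed

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [Row(id=1, items=[Row(a=1), Row(a=2)]), Row(id=2, items=[Row(a=3)])])

# flatten() returns a dict of DataFrames: the root frame holding the scalar
# columns, plus one child frame per exploded array/map column.
dfs = Spark.flatten(df=df, explode_outer=True, explode_pos=True, name="root")
for child_name, child_df in dfs.items():
    print(child_name)
    child_df.show()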