
Commit 7d68fc4

Add first spark flatten test (Simple struct)
1 parent 9b8d5f1 commit 7d68fc4

File tree

4 files changed: +51 -3 lines changed


awswrangler/spark.py

Lines changed: 10 additions & 0 deletions
@@ -319,6 +319,16 @@ def flatten(df: sql.DataFrame,
                 explode_outer: bool = True,
                 explode_pos: bool = True,
                 name: str = "root") -> Dict[str, sql.DataFrame]:
+        """
+        Convert a complex nested DataFrame into one (or many) flat DataFrames.
+        If a column is a struct, it is flattened directly.
+        If a column is an array or map, child DataFrames are created at different granularities.
+        :param df: Spark DataFrame
+        :param explode_outer: Should we preserve the null values on arrays?
+        :param explode_pos: Create columns with the index of the exploded array
+        :param name: The name of the root DataFrame
+        :return: A dictionary with the DataFrame names as keys and the DataFrames as values
+        """
         cols_exprs: List[
             Tuple[str, str, str]] = Spark._flatten_struct_dataframe(
                 df=df, explode_outer=explode_outer, explode_pos=explode_pos)
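For context on the two flags documented above, here is a minimal sketch (not part of this commit) of the underlying pyspark behavior they refer to; the SparkSession setup and the id/tags column names are illustrative only:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, posexplode_outer

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Row id=2 carries a NULL array.
df = spark.createDataFrame([(1, ["x", "y"]), (2, None)], ["id", "tags"])

# Plain explode() silently drops row id=2.
df.select("id", explode("tags")).show()

# posexplode_outer() keeps the NULL row and also emits the element index --
# roughly the behavior selected by explode_outer=True and explode_pos=True
# in flatten().
df.select("id", posexplode_outer("tags")).show()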

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ flake8~=3.7.8
 pytest-cov~=2.8.1
 cfn-lint~=0.23.3
 twine~=1.13.0
-pyspark~=2.4.4
 wheel~=0.33.6
 sphinx~=2.1.2
+pyspark~=2.4.4
 pyspark-stubs~=2.4.0

testing/run-tests.sh

Lines changed: 0 additions & 2 deletions
@@ -3,11 +3,9 @@
 set -e

 cd ..
-rm -rf .pytest_cache .mypy_cache
 pip install -e .
 yapf --in-place --recursive setup.py awswrangler testing/test_awswrangler
 mypy awswrangler
 flake8 setup.py awswrangler testing/test_awswrangler
 pytest --cov=awswrangler testing/test_awswrangler
-rm -rf .pytest_cache .mypy_cache
 cd testing

testing/test_awswrangler/test_spark.py

Lines changed: 40 additions & 0 deletions
@@ -2,8 +2,10 @@

 import pytest
 import boto3
+import pandas as pd
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import lit, array, create_map, struct
+from pyspark.sql.types import StructType, StructField, IntegerType

 from awswrangler import Session

@@ -164,3 +166,41 @@ def test_create_glue_table_csv(session, bucket, database, compression,
     assert int(pandas_df.iloc[0]["id"]) == 4
     assert pandas_df.iloc[0]["name"] == "four"
     assert float(pandas_df.iloc[0]["value"]) == 4.0
+
+
+def test_flatten_simple_struct(session):
+    print()
+    pdf = pd.DataFrame({
+        "a": [1, 2],
+        "b": [
+            {
+                "bb1": 1,
+                "bb2": 2
+            },
+            {
+                "bb1": 1,
+                "bb2": 2
+            },
+        ],
+    })
+    schema = StructType([
+        StructField(name="a", dataType=IntegerType(), nullable=True),
+        StructField(name="b",
+                    dataType=StructType([
+                        StructField(name="bb1",
+                                    dataType=IntegerType(),
+                                    nullable=True),
+                        StructField(name="bb2",
+                                    dataType=IntegerType(),
+                                    nullable=True),
+                    ]),
+                    nullable=True),
+    ])
+    df = session.spark_session.createDataFrame(data=pdf, schema=schema)
+    df.printSchema()
+    dfs = session.spark.flatten(df=df)
+    assert len(dfs) == 1
+    dfs["root"].printSchema()
+    assert str(dfs["root"].dtypes
+               ) == "[('a', 'int'), ('b_bb1', 'int'), ('b_bb2', 'int')]"
+    assert df.count() == dfs["root"].count()
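To close the loop, a quick usage sketch of how the dictionary returned by flatten() would be consumed outside the test harness. This is hypothetical wiring, not part of the commit: it assumes Session accepts a spark_session keyword (as the test suite's session fixture suggests) and uses a made-up struct DataFrame:

from pyspark.sql import SparkSession
from awswrangler import Session

spark = SparkSession.builder.master("local[1]").getOrCreate()
session = Session(spark_session=spark)  # assumption: kwarg as wired by the fixtures

df = spark.createDataFrame([(1, (1, 2))], "a int, b struct<bb1:int,bb2:int>")
dfs = session.spark.flatten(df=df, name="root")
for key, frame in dfs.items():  # a pure-struct input yields a single entry
    print(key)                  # "root"
    frame.printSchema()         # columns: a, b_bb1, b_bb2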
