Commit 19fc8d3

Authored by jaoanan1126, Calvin Wang, and Joan Aoanan

Issue #49 - implemented setPredefinedTypes(); Issue #34 - fixed addAnomalyCheck signature (#56)

* #49 - setPredefinedTypes(); #34 - _AnalyzerObject

Co-authored-by: Calvin Wang <[email protected]>
Co-authored-by: Joan Aoanan <[email protected]>

1 parent: 30375bb · commit: 19fc8d3

File tree: 5 files changed (+59 / -18 lines)

README.md

Lines changed: 9 additions & 4 deletions

````diff
@@ -1,6 +1,6 @@
 # PyDeequ
 
-PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets. PyDeequ is written to support usage of Deequ in Python.
+PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets. PyDeequ is written to support usage of Deequ in Python .
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) ![Coverage](https://img.shields.io/badge/coverage-90%25-green)
 
@@ -11,7 +11,11 @@ There are 4 main components of Deequ, and they are:
 - Constraint Suggestion:
   - Specify rules for various groups of Analyzers to be run over a dataset to return back a collection of constraints suggested to run in a Verification Suite.
 - Constraint Verification:
+<<<<<<< HEAD
+  - Perform data validation on a dataset with respect to various constraints set by you.
+=======
   - Perform data validation on a dataset with respect to various constraints set by you.
+>>>>>>> 30375bb8645728a539b7b2f6d2d85f89266ac047
 - Metrics Repository
   - Allows for persistence and tracking of Deequ runs over time.
 
@@ -152,13 +156,14 @@ Please refer to the [contributing doc](https://github.com/awslabs/python-deequ/b
 
 This library is licensed under the Apache 2.0 License.
 
-## Getting Started
+******
+
+## Contributing Developer Setup
 
 1. Setup [SDKMAN](#setup-sdkman)
 1. Setup [Java](#setup-java)
 1. Setup [Apache Spark](#setup-apache-spark)
 1. Install [Poetry](#poetry)
-1. Install Pre-commit and [follow instruction in here](PreCommit.MD)
 1. Run [tests locally](#running-tests-locally)
 
 ### Setup SDKMAN
@@ -232,4 +237,4 @@ Take a look at tests in `tests/dataquality` and `tests/jobs`
 
 ```bash
 $ poetry run pytest
-```
+```
````
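For orientation, the "Constraint Verification" component touched by this README hunk is driven by VerificationSuite. Here is a minimal sketch in the style of the PyDeequ README, assuming the standard pydeequ Maven coordinates; the DataFrame contents and constraints are illustrative, not part of this commit:

```python
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite

# Spark session with the Deequ jar on the classpath.
spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())

df = spark.createDataFrame([Row(a="foo", b=1), Row(a="bar", b=2), Row(a="baz", b=3)])

# Constraint Verification: validate the dataset against constraints set by you.
check = Check(spark, CheckLevel.Warning, "Review Check")
result = (VerificationSuite(spark)
          .onData(df)
          .addCheck(check.hasSize(lambda size: size == 3).isComplete("b"))
          .run())
VerificationResult.checkResultsAsDataFrame(spark, result).show()
```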

pydeequ/analyzers.py

Lines changed: 26 additions & 0 deletions

```diff
@@ -8,6 +8,7 @@
 
 from pydeequ.pandas_utils import ensure_pyspark_df
 from pydeequ.repository import MetricsRepository, ResultKey
+from enum import Enum
 from pydeequ.scala_utils import to_scala_seq
 
 
@@ -798,3 +799,28 @@ def _analyzer_jvm(self):
         return self._deequAnalyzers.UniqueValueRatio(
             to_scala_seq(self._jvm, self.columns), self._jvm.scala.Option.apply(self.where)
         )
+
+class DataTypeInstances(Enum):
+    """
+    An enum class that types columns to scala datatypes
+    """
+    Boolean = "Boolean"
+    Unknown = "Unknown"
+    Fractional = "Fractional"
+    Integral = "Integral"
+    String = "String"
+
+    def _create_java_object(self, jvm):
+        dataType_analyzers_class = jvm.com.amazon.deequ.analyzers.DataTypeInstances
+        if self == DataTypeInstances.String:
+            return dataType_analyzers_class.String()
+        elif self == DataTypeInstances.Boolean:
+            return dataType_analyzers_class.Boolean()
+        elif self == DataTypeInstances.Unknown:
+            return dataType_analyzers_class.Unknown()
+        elif self == DataTypeInstances.Integral:
+            return dataType_analyzers_class.Integral()
+        elif self == DataTypeInstances.Fractional:
+            return dataType_analyzers_class.Fractional()
+        else:
+            raise ValueError(f"{jvm} is not a valid datatype Object")
```
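The new DataTypeInstances enum is a thin Python-side mirror of the Scala singletons on com.amazon.deequ.analyzers.DataTypeInstances; _create_java_object() is what setPredefinedTypes() (see pydeequ/profiles.py below) calls to cross the Py4J bridge, so user code only ever passes the enum members. A minimal sketch of the mapping, assuming a SparkSession with the Deequ jar on the classpath:

```python
from pyspark.sql import SparkSession

import pydeequ
from pydeequ.analyzers import DataTypeInstances

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())

# Each Python member resolves to the like-named Scala value via Py4J;
# user code normally never calls the private _create_java_object() itself.
for member in DataTypeInstances:
    jvm_value = member._create_java_object(spark.sparkContext._jvm)
    print(member.value, "->", jvm_value.toString())
```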

pydeequ/profiles.py

Lines changed: 10 additions & 7 deletions

```diff
@@ -1,15 +1,13 @@
 # -*- coding: utf-8 -*-
 """ Profiles file for all the Profiles classes in Deequ"""
-# from pydeequ.analyzers import *
-# from pydeequ.metrics import *
 import json
 from collections import namedtuple
 
 from pyspark.sql import DataFrame, SparkSession
-
 from pydeequ.analyzers import KLLParameters
 from pydeequ.metrics import BucketDistribution
 from pydeequ.pandas_utils import ensure_pyspark_df
+from enum import Enum
 from pydeequ.scala_utils import (
     get_or_else_none,
     java_list_to_python_list,
@@ -181,14 +179,18 @@ def setKLLParameters(self, kllParameters: KLLParameters):
         self._ColumnProfilerRunBuilder.setKLLParameters(self._jvm.scala.Option.apply(kllParameters._param))
         return self
 
-    def setPredefinedTypes(self, dataTypes: dict):
+    def setPredefinedTypes(self, dataTypesDict: dict):
         """
         Set predefined data types for each column (e.g. baseline)
 
-        :param dict dataTypes: dataType map for baseline columns
-        :return: Baseline for each column
+        :param dict{"columnName": DataTypeInstance} dataTypes: dataType map for baseline columns.
+        :return: Baseline for each column. I.E. returns the dataType label to the desired DataTypeInstance
         """
-        self._ColumnProfilerRunBuilder.setPredefinedTypes(to_scala_map(self._spark_session, dataTypes))
+        dataType_scala_map = {}
+        for key, value in dataTypesDict.items():
+            val = value._create_java_object(self._jvm)
+            dataType_scala_map[key] = val
+        self._ColumnProfilerRunBuilder.setPredefinedTypes(to_scala_map(self._spark_session, dataType_scala_map))
         return self
 
     def useRepository(self, repository):
@@ -513,3 +515,4 @@ def approxPercentiles(self):
         :return: gets the approximate percentiles of the column
         """
         return self._approxPercentiles
+
```
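With this rewrite, callers pass plain DataTypeInstances members and the builder converts each one to its JVM object before handing the map to Scala. A usage sketch mirroring the new test in this commit; the DataFrame and column names are illustrative, and it assumes ColumnProfile.dataType exposes the profiled type:

```python
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.analyzers import DataTypeInstances
from pydeequ.profiles import ColumnProfilerRunner

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())
df = spark.createDataFrame([Row(a="foo", b=1, c=5.0), Row(a="bar", b=2, c=6.5)])

# Pin a baseline type per column instead of letting the profiler infer one.
result = (ColumnProfilerRunner(spark)
          .onData(df)
          .setPredefinedTypes({
              "a": DataTypeInstances.String,
              "b": DataTypeInstances.Integral,
              "c": DataTypeInstances.Fractional,
          })
          .run())

for column, profile in result.profiles.items():
    print(column, "->", profile.dataType)
```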

pydeequ/verification.py

Lines changed: 2 additions & 4 deletions

```diff
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-# from pydeequ.analyzers import *
-# from pydeequ.anomaly_detection import *
+from pydeequ.analyzers import _AnalyzerObject
 import json
 
 from pyspark import SQLContext
@@ -179,7 +177,7 @@ def addCheck(self, check: Check):
         self._VerificationRunBuilder.addCheck(check._Check)
         return self
 
-    def addAnomalyCheck(self, anomaly, analyzer: AnalysisRunBuilder, anomalyCheckConfig=None):
+    def addAnomalyCheck(self, anomaly, analyzer: _AnalyzerObject, anomalyCheckConfig=None):
         """
         Add a check using anomaly_detection methods. The Anomaly Detection Strategy only checks
         if the new value is an Anomaly.
```
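The signature fix matters because addAnomalyCheck expects an analyzer instance such as Size(), not a run builder. A hedged sketch of an anomaly check against an in-memory metrics repository; the strategy parameters and tags are illustrative:

```python
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.analyzers import Size
from pydeequ.anomaly_detection import RelativeRateOfChangeStrategy
from pydeequ.repository import InMemoryMetricsRepository, ResultKey
from pydeequ.verification import VerificationSuite

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())
df = spark.createDataFrame([Row(a="foo"), Row(a="bar")])

repository = InMemoryMetricsRepository(spark)
key = ResultKey(spark, ResultKey.current_milli_time(), {"tag": "daily"})

# Size() is an _AnalyzerObject, matching the corrected type hint.
result = (VerificationSuite(spark)
          .onData(df)
          .useRepository(repository)
          .saveOrAppendResult(key)
          .addAnomalyCheck(RelativeRateOfChangeStrategy(maxRateIncrease=2.0), Size())
          .run())
```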

tests/test_profiles.py

Lines changed: 12 additions & 3 deletions

```diff
@@ -1,13 +1,11 @@
 # -*- coding: utf-8 -*-
 import unittest
-
 from pyspark.sql import Row
-
 from pydeequ.analyzers import KLLParameters
 from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
+from pydeequ.analyzers import KLLParameters, DataTypeInstances
 from tests.conftest import setup_pyspark
 
-
 class TestProfiles(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -20,11 +18,22 @@ def tearDownClass(cls):
         cls.spark.sparkContext._gateway.shutdown_callback_server()
         cls.spark.stop()
 
+    def test_setPredefinedTypes(self):
+        result = ColumnProfilerRunner(self.spark) \
+            .onData(self.df) \
+            .setPredefinedTypes({'a': DataTypeInstances.Unknown, 'b': DataTypeInstances.String, 'c': DataTypeInstances.Fractional}) \
+            .run()
+        print(result)
+        for col, profile in result.profiles.items():
+            print("Profiles:", profile)
+
     def test_profile_run(self):
         result = ColumnProfilerRunner(self.spark).onData(self.df).run()
         for col, profile in result.profiles.items():
+            print(profile)
             print(f"col: {col} -> profile: {profile}")
 
+        print("Results: ", result)
         print(result.profiles["a"].column, result.profiles["a"].completeness)
 
     def test_kll_and_approxPercentiles(self):
```
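The new test only prints the resulting profiles. A hypothetical follow-up (not part of this commit) could assert that the baseline override is reflected; this sketch slots into TestProfiles and assumes the profiled dataType echoes the predefined value:

```python
    def test_setPredefinedTypes_baseline(self):
        # Hypothetical follow-up test, not part of this commit.
        result = ColumnProfilerRunner(self.spark) \
            .onData(self.df) \
            .setPredefinedTypes({"b": DataTypeInstances.String}) \
            .run()
        self.assertIn("b", result.profiles)
        # Assumes the profiled type echoes the predefined baseline.
        self.assertEqual(str(result.profiles["b"].dataType), "String")
```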
