Commit 19fc8d3

Authored by jaoanan1126, Calvin Wang, and Joan Aoanan

Issue #49 - implemented setPredefinedTypes(); Issue #34 - fixed addAnomalyCheck signature (#56)

* #49 - setPredefinedTypes(); #34 - _AnalyzerObject

Co-authored-by: Calvin Wang <[email protected]>
Co-authored-by: Joan Aoanan <[email protected]>

1 parent: 30375bb · commit: 19fc8d3

File tree: 5 files changed (+59 / -18 lines)

README.md

Lines changed: 9 additions & 4 deletions

````diff
@@ -1,6 +1,6 @@
 # PyDeequ
 
-PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets. PyDeequ is written to support usage of Deequ in Python.
+PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets. PyDeequ is written to support usage of Deequ in Python .
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) ![Coverage](https://img.shields.io/badge/coverage-90%25-green)
 
@@ -11,7 +11,11 @@ There are 4 main components of Deequ, and they are:
 - Constraint Suggestion:
   - Specify rules for various groups of Analyzers to be run over a dataset to return back a collection of constraints suggested to run in a Verification Suite.
 - Constraint Verification:
+<<<<<<< HEAD
+  - Perform data validation on a dataset with respect to various constraints set by you.
+=======
   - Perform data validation on a dataset with respect to various constraints set by you.
+>>>>>>> 30375bb8645728a539b7b2f6d2d85f89266ac047
 - Metrics Repository
   - Allows for persistence and tracking of Deequ runs over time.
 
@@ -152,13 +156,14 @@ Please refer to the [contributing doc](https://github.com/awslabs/python-deequ/b
 
 This library is licensed under the Apache 2.0 License.
 
-## Getting Started
+******
+
+## Contributing Developer Setup
 
 1. Setup [SDKMAN](#setup-sdkman)
 1. Setup [Java](#setup-java)
 1. Setup [Apache Spark](#setup-apache-spark)
 1. Install [Poetry](#poetry)
-1. Install Pre-commit and [follow instruction in here](PreCommit.MD)
 1. Run [tests locally](#running-tests-locally)
 
 ### Setup SDKMAN
@@ -232,4 +237,4 @@ Take a look at tests in `tests/dataquality` and `tests/jobs`
 
 ```bash
 $ poetry run pytest
-```
+```
````
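For orientation, the "Constraint Verification" component touched by this README hunk is driven by VerificationSuite. Here is a minimal sketch in the style of the PyDeequ README, assuming the standard pydeequ Maven coordinates; the DataFrame contents and constraints are illustrative, not part of this commit:

```python
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite

# Spark session with the Deequ jar on the classpath.
spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())

df = spark.createDataFrame([Row(a="foo", b=1), Row(a="bar", b=2), Row(a="baz", b=3)])

# Constraint Verification: validate the dataset against constraints set by you.
check = Check(spark, CheckLevel.Warning, "Review Check")
result = (VerificationSuite(spark)
          .onData(df)
          .addCheck(check.hasSize(lambda size: size == 3).isComplete("b"))
          .run())
VerificationResult.checkResultsAsDataFrame(spark, result).show()
```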

pydeequ/analyzers.py

Lines changed: 26 additions & 0 deletions

```diff
@@ -8,6 +8,7 @@
 
 from pydeequ.pandas_utils import ensure_pyspark_df
 from pydeequ.repository import MetricsRepository, ResultKey
+from enum import Enum
 from pydeequ.scala_utils import to_scala_seq
 
 
@@ -798,3 +799,28 @@ def _analyzer_jvm(self):
         return self._deequAnalyzers.UniqueValueRatio(
             to_scala_seq(self._jvm, self.columns), self._jvm.scala.Option.apply(self.where)
         )
+
+class DataTypeInstances(Enum):
+    """
+    An enum class that types columns to scala datatypes
+    """
+    Boolean = "Boolean"
+    Unknown = "Unknown"
+    Fractional = "Fractional"
+    Integral = "Integral"
+    String = "String"
+
+    def _create_java_object(self, jvm):
+        dataType_analyzers_class = jvm.com.amazon.deequ.analyzers.DataTypeInstances
+        if self == DataTypeInstances.String:
+            return dataType_analyzers_class.String()
+        elif self == DataTypeInstances.Boolean:
+            return dataType_analyzers_class.Boolean()
+        elif self == DataTypeInstances.Unknown:
+            return dataType_analyzers_class.Unknown()
+        elif self == DataTypeInstances.Integral:
+            return dataType_analyzers_class.Integral()
+        elif self == DataTypeInstances.Fractional:
+            return dataType_analyzers_class.Fractional()
+        else:
+            raise ValueError(f"{jvm} is not a valid datatype Object")
```
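The new DataTypeInstances enum is a thin Python-side mirror of the Scala singletons on com.amazon.deequ.analyzers.DataTypeInstances; _create_java_object() is what setPredefinedTypes() (see pydeequ/profiles.py below) calls to cross the Py4J bridge, so user code only ever passes the enum members. A minimal sketch of the mapping, assuming a SparkSession with the Deequ jar on the classpath:

```python
from pyspark.sql import SparkSession

import pydeequ
from pydeequ.analyzers import DataTypeInstances

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())

# Each Python member resolves to the like-named Scala value via Py4J;
# user code normally never calls the private _create_java_object() itself.
for member in DataTypeInstances:
    jvm_value = member._create_java_object(spark.sparkContext._jvm)
    print(member.value, "->", jvm_value.toString())
```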

pydeequ/profiles.py

Lines changed: 10 additions & 7 deletions

```diff
@@ -1,15 +1,13 @@
 # -*- coding: utf-8 -*-
 """ Profiles file for all the Profiles classes in Deequ"""
-# from pydeequ.analyzers import *
-# from pydeequ.metrics import *
 import json
 from collections import namedtuple
 
 from pyspark.sql import DataFrame, SparkSession
-
 from pydeequ.analyzers import KLLParameters
 from pydeequ.metrics import BucketDistribution
 from pydeequ.pandas_utils import ensure_pyspark_df
+from enum import Enum
 from pydeequ.scala_utils import (
     get_or_else_none,
     java_list_to_python_list,
@@ -181,14 +179,18 @@ def setKLLParameters(self, kllParameters: KLLParameters):
         self._ColumnProfilerRunBuilder.setKLLParameters(self._jvm.scala.Option.apply(kllParameters._param))
         return self
 
-    def setPredefinedTypes(self, dataTypes: dict):
+    def setPredefinedTypes(self, dataTypesDict: dict):
         """
         Set predefined data types for each column (e.g. baseline)
 
-        :param dict dataTypes: dataType map for baseline columns
-        :return: Baseline for each column
+        :param dict{"columnName": DataTypeInstance} dataTypes: dataType map for baseline columns.
+        :return: Baseline for each column. I.E. returns the dataType label to the desired DataTypeInstance
         """
-        self._ColumnProfilerRunBuilder.setPredefinedTypes(to_scala_map(self._spark_session, dataTypes))
+        dataType_scala_map = {}
+        for key, value in dataTypesDict.items():
+            val = value._create_java_object(self._jvm)
+            dataType_scala_map[key] = val
+        self._ColumnProfilerRunBuilder.setPredefinedTypes(to_scala_map(self._spark_session, dataType_scala_map))
         return self
 
     def useRepository(self, repository):
@@ -513,3 +515,4 @@ def approxPercentiles(self):
         :return: gets the approximate percentiles of the column
         """
         return self._approxPercentiles
+
```
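With this rewrite, callers pass plain DataTypeInstances members and the builder converts each one to its JVM object before handing the map to Scala. A usage sketch mirroring the new test in this commit; the DataFrame and column names are illustrative, and it assumes ColumnProfile.dataType exposes the profiled type:

```python
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.analyzers import DataTypeInstances
from pydeequ.profiles import ColumnProfilerRunner

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())
df = spark.createDataFrame([Row(a="foo", b=1, c=5.0), Row(a="bar", b=2, c=6.5)])

# Pin a baseline type per column instead of letting the profiler infer one.
result = (ColumnProfilerRunner(spark)
          .onData(df)
          .setPredefinedTypes({
              "a": DataTypeInstances.String,
              "b": DataTypeInstances.Integral,
              "c": DataTypeInstances.Fractional,
          })
          .run())

for column, profile in result.profiles.items():
    print(column, "->", profile.dataType)
```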

pydeequ/verification.py

Lines changed: 2 additions & 4 deletions

```diff
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-# from pydeequ.analyzers import *
-# from pydeequ.anomaly_detection import *
+from pydeequ.analyzers import _AnalyzerObject
 import json
 
 from pyspark import SQLContext
@@ -179,7 +177,7 @@ def addCheck(self, check: Check):
         self._VerificationRunBuilder.addCheck(check._Check)
         return self
 
-    def addAnomalyCheck(self, anomaly, analyzer: AnalysisRunBuilder, anomalyCheckConfig=None):
+    def addAnomalyCheck(self, anomaly, analyzer: _AnalyzerObject, anomalyCheckConfig=None):
         """
         Add a check using anomaly_detection methods. The Anomaly Detection Strategy only checks
         if the new value is an Anomaly.
```
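The signature fix matters because addAnomalyCheck expects an analyzer instance such as Size(), not a run builder. A hedged sketch of an anomaly check against an in-memory metrics repository; the strategy parameters and tags are illustrative:

```python
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.analyzers import Size
from pydeequ.anomaly_detection import RelativeRateOfChangeStrategy
from pydeequ.repository import InMemoryMetricsRepository, ResultKey
from pydeequ.verification import VerificationSuite

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())
df = spark.createDataFrame([Row(a="foo"), Row(a="bar")])

repository = InMemoryMetricsRepository(spark)
key = ResultKey(spark, ResultKey.current_milli_time(), {"tag": "daily"})

# Size() is an _AnalyzerObject, matching the corrected type hint.
result = (VerificationSuite(spark)
          .onData(df)
          .useRepository(repository)
          .saveOrAppendResult(key)
          .addAnomalyCheck(RelativeRateOfChangeStrategy(maxRateIncrease=2.0), Size())
          .run())
```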

tests/test_profiles.py

Lines changed: 12 additions & 3 deletions

```diff
@@ -1,13 +1,11 @@
 # -*- coding: utf-8 -*-
 import unittest
-
 from pyspark.sql import Row
-
 from pydeequ.analyzers import KLLParameters
 from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
+from pydeequ.analyzers import KLLParameters, DataTypeInstances
 from tests.conftest import setup_pyspark
 
-
 class TestProfiles(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -20,11 +18,22 @@ def tearDownClass(cls):
         cls.spark.sparkContext._gateway.shutdown_callback_server()
         cls.spark.stop()
 
+    def test_setPredefinedTypes(self):
+        result = ColumnProfilerRunner(self.spark) \
+            .onData(self.df) \
+            .setPredefinedTypes({'a': DataTypeInstances.Unknown, 'b': DataTypeInstances.String, 'c': DataTypeInstances.Fractional}) \
+            .run()
+        print(result)
+        for col, profile in result.profiles.items():
+            print("Profiles:", profile)
+
     def test_profile_run(self):
         result = ColumnProfilerRunner(self.spark).onData(self.df).run()
         for col, profile in result.profiles.items():
+            print(profile)
             print(f"col: {col} -> profile: {profile}")
 
+        print("Results: ", result)
         print(result.profiles["a"].column, result.profiles["a"].completeness)
 
     def test_kll_and_approxPercentiles(self):
```
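The new test only prints the resulting profiles. A hypothetical follow-up (not part of this commit) could assert that the baseline override is reflected; this sketch slots into TestProfiles and assumes the profiled dataType echoes the predefined value:

```python
    def test_setPredefinedTypes_baseline(self):
        # Hypothetical follow-up test, not part of this commit.
        result = ColumnProfilerRunner(self.spark) \
            .onData(self.df) \
            .setPredefinedTypes({"b": DataTypeInstances.String}) \
            .run()
        self.assertIn("b", result.profiles)
        # Assumes the profiled type echoes the predefined baseline.
        self.assertEqual(str(result.profiles["b"].dataType), "String")
```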
