
Commit 9c2ad28

Calvin Wang committed: hasPattern Check support

1 parent 73c88cc · commit 9c2ad28

4 files changed: +185 −7 lines

pydeequ/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@
 from pydeequ.analyzers import AnalysisRunner
 from pydeequ.checks import Check, CheckLevel

-# deequ_maven_coord = "com.amazon.deequ:deequ:1.1.0_spark-2.4-scala-2.11"
-deequ_maven_coord = "com.amazon.deequ:deequ:1.1.0_spark-3.0-scala-2.12"
+deequ_maven_coord = "com.amazon.deequ:deequ:1.1.0_spark-2.4-scala-2.11"
+# deequ_maven_coord = "com.amazon.deequ:deequ:1.1.0_spark-3.0-scala-2.12"
 f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all"

 class PyDeequSession:
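
This hunk flips the default artifact back to the Spark 2.4 / Scala 2.11 build of Deequ; the coordinate has to match the Spark version actually installed. A minimal sketch (not part of this commit; the version test is an assumption) of selecting it at session startup:

# Sketch: pick the Deequ artifact matching the installed PySpark (assumed convention).
import pyspark
import pydeequ
from pyspark.sql import SparkSession

deequ_coord = (
    "com.amazon.deequ:deequ:1.1.0_spark-3.0-scala-2.12"
    if pyspark.__version__.startswith("3.")
    else "com.amazon.deequ:deequ:1.1.0_spark-2.4-scala-2.11"
)

spark = (SparkSession.builder
         .config("spark.jars.packages", deequ_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)  # from pydeequ/__init__.py
         .getOrCreate())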

pydeequ/checks.py

Lines changed: 13 additions & 4 deletions
@@ -557,7 +557,13 @@ def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
         :param str hint: A hint that states why a constraint could have failed.
         :return: hasPattern self: A Check object that runs the condition on the column.
         """
-        pass
+        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
+            else getattr(self._Check, "hasPattern$default$2")()
+        name = self._jvm.scala.Option.apply(name)
+        hint = self._jvm.scala.Option.apply(hint)
+        pattern_regex = self._jvm.scala.util.matching.Regex(pattern, None)
+        self._Check = self._Check.hasPattern(column, pattern_regex, assertion_func, name, hint)
+        return self

     def containsCreditCardNumber(self, column, assertion=None, hint=None):
         """
@@ -733,19 +739,22 @@ def isGreaterThanOrEqualTo(self, columnA, columnB, assertion=None, hint=None):
         self._Check = self._Check.isGreaterThanOrEqualTo(columnA, columnB, assertion_func, hint)
         return self

-    def isContainedIn(self, column, allowed_values):
+    def isContainedIn(self, column, allowed_values, assertion=None, hint=None):
         """
         Asserts that every non-null value in a column is contained in a set of predefined values
-
         :param str column: Column in DataFrame to run the assertion on.
         :param list[str] allowed_values: A function that accepts allowed values for the column.
+        :param lambda assertion: A function that accepts an int or float parameter.
         :param str hint: A hint that states why a constraint could have failed.
         :return: isContainedIn self: A Check object that runs the assertion on the columns.
         """
         arr = self._spark_session.sparkContext._gateway.new_array(self._jvm.java.lang.String, len(allowed_values))
         for i in range(0, len(allowed_values)):
             arr[i] = allowed_values[i]
-        self._Check = self._Check.isContainedIn(column, arr)
+        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
+            else getattr(self._Check, "IsOne")()
+        hint = self._jvm.scala.Option.apply(hint)
+        self._Check = self._Check.isContainedIn(column, arr, assertion_func, hint)
         return self

     def evaluate(self, context):
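
A matching sketch for the widened isContainedIn signature (same assumptions: `spark` and the test DataFrame `df`). With no assertion the default still requires every non-null value to be in the set; an explicit assertion receives the fraction of rows whose value is allowed:

check = Check(spark, CheckLevel.Warning, "category check")
result = (VerificationSuite(spark)
          .onData(df)
          # column 'a' holds foo/bar/baz, so exactly two of three values are allowed
          .addCheck(check.isContainedIn("a", ["foo", "bar"],
                                        assertion=lambda x: x >= 2/3,
                                        hint="at most one unexpected category"))
          .run())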

tests/test_checks.py

Lines changed: 45 additions & 1 deletion
@@ -22,7 +22,7 @@ def setUpClass(cls):
         cls.df = cls.sc.parallelize([
             Row(a="foo", b=1, c=5, d=5, e=3, f=1, g='a', h=0, creditCard="5130566665286573", email="[email protected]", ssn="123-45-6789", URL="http://[email protected]:8080", boolean="true"),
             Row(a="bar", b=2, c=6, d=5, e=2, f=2, g='b', h=-1, creditCard="4532677117740914", email="[email protected]", ssn="123456789", URL="http://foo.com/(something)?after=parens", boolean="false"),
-            Row(a="baz", b=3, c=None, d=5, e=1, f=1, g=None, h=2, creditCard="340145324521741", email="yourusername@example.com", ssn="000-00-0000", URL="http://[email protected]:8080", boolean="true")]).toDF()
+            Row(a="baz", b=3, c=None, d=5, e=1, f=1, g=None, h=2, creditCard="3401453245217421", email="yourusername@meow.com", ssn="000-00-0000", URL="http://[email protected]:8080", boolean="true")]).toDF()

     @classmethod
     def tearDownClass(cls):

@@ -52,6 +52,16 @@ def hasSize(self, assertion, hint = None):
         df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
         return df.select('constraint_status').collect()

+    def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
+        check = Check(self.spark, CheckLevel.Warning, "test hasPattern")
+
+        result = VerificationSuite(self.spark).onData(self.df) \
+            .addCheck(check.hasPattern(column, pattern, assertion, name, hint)) \
+            .run()
+
+        df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
+        return df.select('constraint_status').collect()
+
     def containsCreditCardNumber(self, column, assertion=None, hint=None):
         check = Check(self.spark, CheckLevel.Warning, "test containsCreditCardNumber")
         result = VerificationSuite(self.spark).onData(self.df) \

@@ -402,6 +412,40 @@ def test_fail_hasSize(self):
         self.assertEqual(self.hasSize(lambda x: (x > 2.0), "size of dataframe should be 3"),
                          [Row(constraint_status='Failure')])

+    def test_hasPattern(self):
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r".*@meow.com",
+                                         assertion=lambda x: x == 1/3),
+                         [Row(constraint_status='Success')])
+
+        self.assertEqual(self.hasPattern(column='creditCard',
+                                         pattern=r"\(|\)|\d{16}",
+                                         assertion=lambda x: x == 0.0),
+                         [Row(constraint_status='Failure')])
+
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
+                                         assertion=lambda x: x == 1.0),
+                         [Row(constraint_status='Success')])
+
+
+    @unittest.expectedFailure
+    def test_fail_hasPattern(self):
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r".*@meow.com",
+                                         assertion=lambda x: x == 2/3),
+                         [Row(constraint_status='Success')])
+
+        self.assertEqual(self.hasPattern(column='creditCard',
+                                         pattern=r"\(|\)|\d{16}",
+                                         assertion=lambda x: x == 1.0),
+                         [Row(constraint_status='Failure')])
+
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
+                                         assertion=lambda x: x == 0.0),
+                         [Row(constraint_status='Success')])
+
     def test_containsCreditCardNumber(self):
         self.assertEqual(self.containsCreditCardNumber("creditCard"), [Row(constraint_status='Success')])
         self.assertEqual(self.containsCreditCardNumber("creditCard", lambda x: x == 1.0, "All rows contain a credit card number"),

tutorials/hasPattern_check.ipynb

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Announcing the `hasPattern` Rule feature! \n",
+    "\n",
+    "This allows you to apply regex rule matching to your data's columns with PyDeequ's Verification suite! "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pydeequ\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append(\"/Users/calviwan/Desktop/myProjects/python-deequ\")\n",
+    "import pydeequ\n",
+    "\n",
+    "from pyspark.sql import SparkSession, Row\n",
+    "\n",
+    "spark = (SparkSession\n",
+    "    .builder\n",
+    "    .config(\"spark.jars.packages\", pydeequ.deequ_maven_coord)\n",
+    "    .config(\"spark.jars.excludes\", pydeequ.f2j_maven_coord)\n",
+    "    .getOrCreate())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = spark.sparkContext.parallelize([\n",
+    "    Row(a=\"foo\", creditCard=\"5130566665286573\", email=\"[email protected]\", ssn=\"123-45-6789\", URL=\"http://[email protected]:8080\"),\n",
+    "    Row(a=\"bar\", creditCard=\"4532677117740914\", email=\"[email protected]\", ssn=\"123456789\", URL=\"http://foo.com/(something)?after=parens\"),\n",
+    "    Row(a=\"baz\", creditCard=\"3401453245217421\", email=\"[email protected]\", ssn=\"000-00-0000\", URL=\"http://[email protected]:8080\")]).toDF()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+----------------+-----------+------------+--------------------+-----------------+--------------------+\n",
+      "|           check|check_level|check_status|          constraint|constraint_status|  constraint_message|\n",
+      "+----------------+-----------+------------+--------------------+-----------------+--------------------+\n",
+      "|Integrity checks|      Error|       Error|PatternMatchConst...|          Success|                    |\n",
+      "|Integrity checks|      Error|       Error|PatternMatchConst...|          Failure|Value: 0.66666666...|\n",
+      "|Integrity checks|      Error|       Error|PatternMatchConst...|          Success|                    |\n",
+      "+----------------+-----------+------------+--------------------+-----------------+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pydeequ.checks import *\n",
+    "from pydeequ.verification import *\n",
+    "\n",
+    "check = Check(spark, CheckLevel.Error, \"Integrity checks\")\n",
+    "\n",
+    "checkResult = VerificationSuite(spark) \\\n",
+    "    .onData(df) \\\n",
+    "    .addCheck(\n",
+    "        check.hasPattern(column='email',\n",
+    "            pattern=r\".*@baz.com\",\n",
+    "            assertion=lambda x: x == 1/3) \\\n",
+    "        .hasPattern(column='a',\n",
+    "            pattern=r\"ba(r|z)\",\n",
+    "            assertion=lambda x: x == 0/3) \\\n",
+    "        .hasPattern(column='email',\n",
+    "            pattern=r\"\"\"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])\"\"\",\n",
+    "            assertion=lambda x: x == 1.0)) \\\n",
+    "    .run()\n",
+    "\n",
+    "checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)\n",
+    "checkResult_df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
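
As a sanity check on the assertion values used above (this reading of the metric is an assumption, not something stated in the diff): the number handed to a hasPattern assertion behaves like the matching-row fraction, which you can reproduce in plain PySpark:

# 'a' matches ba(r|z) on 2 of 3 rows, hence the Failure row showing Value: 0.66666666...
matched = df.filter(df["a"].rlike(r"ba(r|z)")).count()
print(matched / df.count())  # 0.666...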
