
Commit 8733710

gucciwang and Calvin Wang authored
hasPattern Support (#36)
* Refactor testing modules + testing spark3 bump
* hasPattern Check support
* Patch Check tests

Co-authored-by: Calvin Wang <[email protected]>
1 parent 522f9bf commit 8733710

File tree: 11 files changed, +197 -33 lines


pydeequ/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,8 @@
 from pydeequ.analyzers import AnalysisRunner
 from pydeequ.checks import Check, CheckLevel

-deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"
+deequ_maven_coord = "com.amazon.deequ:deequ:1.1.0_spark-2.4-scala-2.11"
+# deequ_maven_coord = "com.amazon.deequ:deequ:1.1.0_spark-3.0-scala-2.12"
 f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all"

 class PyDeequSession:
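The coordinate now pins deequ 1.1.0 to a specific Spark/Scala build, with the Spark 3.0 / Scala 2.12 coordinate left commented out as the alternative. A minimal sketch of how this constant typically reaches a SparkSession — the .config() keys are standard Spark packaging options assumed here, not shown in this diff:

# Minimal sketch, assuming the standard Spark packaging options;
# swap in the commented Spark 3.0 coordinate above to target Spark 3.
from pyspark.sql import SparkSession
import pydeequ

spark = (SparkSession.builder
         .master('local[*]')
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())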

pydeequ/checks.py

Lines changed: 13 additions & 4 deletions
@@ -557,7 +557,13 @@ def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
         :param str hint: A hint that states why a constraint could have failed.
         :return: hasPattern self: A Check object that runs the condition on the column.
         """
-        pass
+        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
+            else getattr(self._Check, "hasPattern$default$2")()
+        name = self._jvm.scala.Option.apply(name)
+        hint = self._jvm.scala.Option.apply(hint)
+        pattern_regex = self._jvm.scala.util.matching.Regex(pattern, None)
+        self._Check = self._Check.hasPattern(column, pattern_regex, assertion_func, name, hint)
+        return self

     def containsCreditCardNumber(self, column, assertion=None, hint=None):
         """
@@ -733,19 +739,22 @@ def isGreaterThanOrEqualTo(self, columnA, columnB, assertion=None, hint=None):
         self._Check = self._Check.isGreaterThanOrEqualTo(columnA, columnB, assertion_func, hint)
         return self

-    def isContainedIn(self, column, allowed_values):
+    def isContainedIn(self, column, allowed_values, assertion=None, hint=None):
         """
         Asserts that every non-null value in a column is contained in a set of predefined values
-
         :param str column: Column in DataFrame to run the assertion on.
         :param list[str] allowed_values: A function that accepts allowed values for the column.
+        :param lambda assertion: A function that accepts an int or float parameter.
         :param str hint: A hint that states why a constraint could have failed.
         :return: isContainedIn self: A Check object that runs the assertion on the columns.
         """
         arr = self._spark_session.sparkContext._gateway.new_array(self._jvm.java.lang.String, len(allowed_values))
         for i in range(0, len(allowed_values)):
             arr[i] = allowed_values[i]
-        self._Check = self._Check.isContainedIn(column, arr)
+        assertion_func = ScalaFunction1(self._spark_session.sparkContext._gateway, assertion) if assertion \
+            else getattr(self._Check, "IsOne")()
+        hint = self._jvm.scala.Option.apply(hint)
+        self._Check = self._Check.isContainedIn(column, arr, assertion_func, hint)
        return self

     def evaluate(self, context):
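Taken together: hasPattern converts the Python pattern string into a scala.util.matching.Regex through the JVM gateway and wraps the optional assertion in a ScalaFunction1 (falling back to the Scala default argument), while isContainedIn gains optional assertion and hint parameters. A hedged usage sketch — `spark` and `df` are assumed to exist, and the column names are invented for illustration:

# Usage sketch for the new API surface; `spark`/`df` and the column
# names ("email", "status") are hypothetical stand-ins.
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite, VerificationResult

check = (Check(spark, CheckLevel.Warning, "pattern checks")
         .hasPattern("email", r".*@example\.com", assertion=lambda x: x >= 0.5)
         .isContainedIn("status", ["active", "inactive"],
                        assertion=lambda x: x == 1.0,
                        hint="status must be a known value"))

result = VerificationSuite(spark).onData(df).addCheck(check).run()
VerificationResult.checkResultsAsDataFrame(spark, result).show()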

tests/test_analyzers.py

Lines changed: 1 addition & 3 deletions
@@ -1,14 +1,12 @@
 import unittest
 from pyspark.sql import SparkSession, Row, DataFrame
 from pydeequ.analyzers import *
-from pydeequ import PyDeequSession
+from pydeequ import *

 class TestAnalyzers(unittest.TestCase):

     @classmethod
     def setUpClass(cls):
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3" # TODO: get Maven Coord from Configs
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')
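The same refactor repeats across the test modules below: the per-file Maven coordinate locals are deleted, and `from pydeequ import *` supplies `deequ_maven_coord` and `f2j_maven_coord` to the unchanged builder configuration instead. A hedged sketch of what the imported names resolve to after this commit:

# After the refactor these names come from pydeequ/__init__.py:
from pydeequ import deequ_maven_coord, f2j_maven_coord

assert deequ_maven_coord == "com.amazon.deequ:deequ:1.1.0_spark-2.4-scala-2.11"
assert f2j_maven_coord == "net.sourceforge.f2j:arpack_combined_all"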

tests/test_anomaly_detection.py

Lines changed: 1 addition & 2 deletions
@@ -4,12 +4,11 @@
 from pydeequ.anomaly_detection import *
 from pydeequ.repository import *
 from pydeequ.analyzers import *
+from pydeequ import *

 class TestAnomalies(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')

tests/test_checks.py

Lines changed: 49 additions & 8 deletions
@@ -2,14 +2,11 @@
 from pyspark.sql import SparkSession, Row, DataFrame
 from pydeequ.verification import *
 from pydeequ.checks import *
-import py4j
+from pydeequ import *

 class TestChecks(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        # TODO share spark context between test cases?
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3" # TODO get Maven Coord from Configs
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')
@@ -23,9 +20,9 @@ def setUpClass(cls):
                      .getOrCreate())
         cls.sc = cls.spark.sparkContext
         cls.df = cls.sc.parallelize([
-            Row(a="foo", b=1, c=5, d=5, e=3, f=1, g='a', h=0, creditCard="5130566665286573", email="[email protected]", ssn="123-45-6789", URL="http://[email protected]:8080", boolean="true"),
-            Row(a="bar", b=2, c=6, d=5, e=2, f=2, g='b', h=-1, creditCard="4532677117740914", email="[email protected]", ssn="123456789", URL="http://foo.com/(something)?after=parens", boolean="false"),
-            Row(a="baz", b=3, c=None, d=5, e=1, f=1, g=None, h=2, creditCard="340145324521741", email="yourusername@example.com", ssn="000-00-0000", URL ="http://[email protected]:8080", boolean="true")]).toDF()
+            Row(a="foo", b=1, c=5, d=5, e=3, f=1, g='a', h=0, creditCard="5130566665286573", email="[email protected]", ssn="123-45-6789", URL="http://[email protected]:8080", boolean="true"),
+            Row(a="bar", b=2, c=6, d=5, e=2, f=2, g='b', h=-1, creditCard="4532677117740914", email="[email protected]", ssn="123456789", URL="http://foo.com/(something)?after=parens", boolean="false"),
+            Row(a="baz", b=3, c=None, d=5, e=1, f=1, g=None, h=2, creditCard="340145324521741", email="yourusername@meow.com", ssn="000-00-0000", URL ="http://[email protected]:8080", boolean="true")]).toDF()

     @classmethod
     def tearDownClass(cls):
@@ -55,6 +52,16 @@ def hasSize(self, assertion, hint = None):
         df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
         return df.select('constraint_status').collect()

+    def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
+        check = Check(self.spark, CheckLevel.Warning, "test hasPattern")
+
+        result = VerificationSuite(self.spark).onData(self.df) \
+            .addCheck((check.hasPattern(column, pattern, assertion, name, hint))) \
+            .run()
+
+        df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
+        return df.select('constraint_status').collect()
+
     def containsCreditCardNumber(self, column, assertion=None, hint=None):
         check = Check(self.spark, CheckLevel.Warning, "test containsCreditCardNumber")
         result = VerificationSuite(self.spark).onData(self.df) \
@@ -405,6 +412,40 @@ def test_fail_hasSize(self):
         self.assertEqual(self.hasSize(lambda x: (x >2.0), "size of dataframe should be 3"),
                          [Row(constraint_status='Failure')])

+    def test_hasPattern(self):
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r".*@meow.com",
+                                         assertion=lambda x: x == 1/3),
+                         [Row(constraint_status='Success')])
+
+        self.assertEqual(self.hasPattern(column='creditCard',
+                                         pattern=r"\(|\)|\d{16}",
+                                         assertion=lambda x: x == 0.0),
+                         [Row(constraint_status='Failure')])
+
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
+                                         assertion=lambda x: x == 1.0),
+                         [Row(constraint_status='Success')])
+
+    @unittest.expectedFailure
+    def test_fail_hasPattern(self):
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r".*@meow.com",
+                                         assertion=lambda x: x == 2 / 3),
+                         [Row(constraint_status='Success')])
+
+        self.assertEqual(self.hasPattern(column='creditCard',
+                                         pattern=r"\(|\)|\d{16}",
+                                         assertion=lambda x: x == 1.0),
+                         [Row(constraint_status='Failure')])
+
+        self.assertEqual(self.hasPattern(column='email',
+                                         pattern=r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
+                                         assertion=lambda x: x == 0.0),
+                         [Row(constraint_status='Success')])
+
     def test_containsCreditCardNumber(self):
         self.assertEqual(self.containsCreditCardNumber("creditCard"), [Row(constraint_status='Success')])
         self.assertEqual(self.containsCreditCardNumber("creditCard", lambda x: x == 1.0, "All rows contain a credit card number"),
@@ -763,7 +804,7 @@ def test_fail_hasMinLength(self):
                          [Row(constraint_status='Success')])

     def test_hasMaxLength(self):
-        self.assertEqual(self.hasMaxLength("email", lambda x: x == 24, "Column email has 24 characters max"),
+        self.assertEqual(self.hasMaxLength("email", lambda x: x == 21, "Column email has 24 characters max"),
                          [Row(constraint_status='Success')])
         self.assertEqual(self.hasMaxLength('email', lambda x: x == 25, "does not meet criteria"),
                          [Row(constraint_status='Failure')])
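Two details worth calling out in these tests. First, hasPattern's assertion receives the fraction of rows whose column value matches the regex: the fixture has three rows and exactly one @meow.com address, hence lambda x: x == 1/3. Second, the hasMaxLength expectation drops from 24 to 21 because the longest email in the fixture is now "yourusername@meow.com" (21 characters), though the hint string in the committed test still says 24. A hedged restatement of the fraction semantics (`spark` and the three-row `df` are the fixture above):

# Hedged illustration of the matching-fraction semantics:
# three fixture rows, one email ending in @meow.com -> fraction 1/3.
check = Check(spark, CheckLevel.Warning, "fraction demo") \
    .hasPattern("email", r".*@meow.com", lambda x: x == 1/3)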

tests/test_pandas_utils.py

Lines changed: 1 addition & 3 deletions
@@ -6,16 +6,14 @@
 from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
 from pydeequ.verification import *
 from pydeequ.checks import *
-from pydeequ import PyDeequSession
+from pydeequ import *
 from pandas import DataFrame as pandasDF
 import numpy as np

 class TestPandasUtils(unittest.TestCase):

     @classmethod
     def setUpClass(cls):
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3" # TODO: get Maven Coord from Configs
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')

tests/test_profiles.py

Lines changed: 1 addition & 3 deletions
@@ -2,13 +2,11 @@
 from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
 from pydeequ.analyzers import KLLParameters
 from pyspark.sql import SparkSession, Row
-
+from pydeequ import *

 class TestProfiles(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')

tests/test_repository.py

Lines changed: 1 addition & 2 deletions
@@ -4,13 +4,12 @@
 from pydeequ.repository import *
 from pydeequ.verification import *
 from pydeequ.checks import *
+from pydeequ import *


 class TestRepository(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')

tests/test_scala_utils.py

Lines changed: 2 additions & 4 deletions
@@ -1,14 +1,12 @@
 import unittest
 from pydeequ.scala_utils import ScalaFunction1, ScalaFunction2
 from pyspark.sql import SparkSession
-
+from pydeequ import *

 class TestScalaUtils(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        # TODO share spark context between test cases?
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3-rc2" # TODO get Maven Coord from Configs
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
+        # deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3-rc2" # TODO This ran rc2?
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')

tests/test_suggestions.py

Lines changed: 1 addition & 3 deletions
@@ -2,13 +2,11 @@
 from pyspark.sql import SparkSession, Row, DataFrame
 from pydeequ.suggestions import *
 import json
-
+from pydeequ import *

 class TestSuggestions(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3"
-        f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
         cls.spark = (SparkSession
                      .builder
                      .master('local[*]')
