22from pyspark .sql import SparkSession , Row , DataFrame
33from pydeequ .verification import *
44from pydeequ .checks import *
5- import py4j
5+ from pydeequ import *
66
77class TestChecks (unittest .TestCase ):
88 @classmethod
99 def setUpClass (cls ):
10- # TODO share spark context between test cases?
11- deequ_maven_coord = "com.amazon.deequ:deequ:1.0.3" # TODO get Maven Coord from Configs
12- f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" # This package is excluded because it causes an error in the SparkSession fig
1310 cls .spark = (SparkSession
1411 .builder
1512 .master ('local[*]' )
@@ -23,9 +20,9 @@ def setUpClass(cls):
2320 .getOrCreate ())
2421 cls .sc = cls .spark .sparkContext
2522 cls .df = cls .sc .parallelize ([
26- Row (
a = "foo" ,
b = 1 ,
c = 5 ,
d = 5 ,
e = 3 ,
f = 1 ,
g = 'a' ,
h = 0 ,
creditCard = "5130566665286573" ,
email = "[email protected] " ,
ssn = "123-45-6789" ,
URL = "http://[email protected] :8080" ,
boolean = "true" ),
27- Row (
a = "bar" ,
b = 2 ,
c = 6 ,
d = 5 ,
e = 2 ,
f = 2 ,
g = 'b' ,
h = - 1 ,
creditCard = "4532677117740914" ,
email = "[email protected] " ,
ssn = "123456789" ,
URL = "http://foo.com/(something)?after=parens" ,
boolean = "false" ),
28- Row (
a = "baz" ,
b = 3 ,
c = None ,
d = 5 ,
e = 1 ,
f = 1 ,
g = None ,
h = 2 ,
creditCard = "340145324521741" ,
email = "yourusername@example .com" ,
ssn = "000-00-0000" ,
URL = "http://[email protected] :8080" ,
boolean = "true" )]).
toDF ()
23+ Row (
a = "foo" ,
b = 1 ,
c = 5 ,
d = 5 ,
e = 3 ,
f = 1 ,
g = 'a' ,
h = 0 ,
creditCard = "5130566665286573" ,
email = "[email protected] " ,
ssn = "123-45-6789" ,
URL = "http://[email protected] :8080" ,
boolean = "true" ),
24+ Row (
a = "bar" ,
b = 2 ,
c = 6 ,
d = 5 ,
e = 2 ,
f = 2 ,
g = 'b' ,
h = - 1 ,
creditCard = "4532677117740914" ,
email = "[email protected] " ,
ssn = "123456789" ,
URL = "http://foo.com/(something)?after=parens" ,
boolean = "false" ),
25+ Row (
a = "baz" ,
b = 3 ,
c = None ,
d = 5 ,
e = 1 ,
f = 1 ,
g = None ,
h = 2 ,
creditCard = "340145324521741" ,
email = "yourusername@meow .com" ,
ssn = "000-00-0000" ,
URL = "http://[email protected] :8080" ,
boolean = "true" )]).
toDF ()
2926
3027 @classmethod
3128 def tearDownClass (cls ):
@@ -55,6 +52,16 @@ def hasSize(self, assertion, hint = None):
5552 df = VerificationResult .checkResultsAsDataFrame (self .spark , result )
5653 return df .select ('constraint_status' ).collect ()
5754
def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
    """Run one ``hasPattern`` check on the shared fixture and return its status rows.

    A warning-level ``Check`` described as "test hasPattern" is created, the
    ``hasPattern`` constraint is attached with the arguments given here, and
    the check is executed by a ``VerificationSuite`` over ``self.df``.

    :param column: column name forwarded to ``Check.hasPattern``
    :param pattern: regular expression forwarded to ``Check.hasPattern``
    :param assertion: optional callable forwarded to ``Check.hasPattern``
    :param name: optional constraint name forwarded to ``Check.hasPattern``
    :param hint: optional hint text forwarded to ``Check.hasPattern``
    :return: collected rows of the ``constraint_status`` column of the
        check-results DataFrame
    """
    pattern_check = Check(self.spark, CheckLevel.Warning, "test hasPattern")

    # Build the run in the same order as a single chained expression would:
    # suite on data first, then the constraint, then execute.
    run_builder = VerificationSuite(self.spark).onData(self.df)
    constrained_check = pattern_check.hasPattern(column, pattern, assertion, name, hint)
    verification = run_builder.addCheck(constrained_check).run()

    results_df = VerificationResult.checkResultsAsDataFrame(self.spark, verification)
    status_rows = results_df.select('constraint_status').collect()
    return status_rows
64+
5865 def containsCreditCardNumber (self , column , assertion = None , hint = None ):
5966 check = Check (self .spark , CheckLevel .Warning , "test containsCreditCardNumber" )
6067 result = VerificationSuite (self .spark ).onData (self .df ) \
@@ -405,6 +412,40 @@ def test_fail_hasSize(self):
405412 self .assertEqual (self .hasSize (lambda x : (x > 2.0 ), "size of dataframe should be 3" ),
406413 [Row (constraint_status = 'Failure' )])
407414
def test_hasPattern(self):
    """Exercise ``hasPattern`` with three column/pattern/assertion combinations.

    Each scenario lists the column, the regular expression, the assertion
    callable, and the constraint status the check is expected to report.
    Scenarios run in order and the first mismatch aborts the test.
    """
    # Long e-mail regex kept in a local so the scenario table stays readable.
    email_regex = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

    scenarios = (
        ('email', r".*@meow.com", lambda x: x == 1 / 3, 'Success'),
        ('creditCard', r"\(|\)|\d{16}", lambda x: x == 0.0, 'Failure'),
        ('email', email_regex, lambda x: x == 1.0, 'Success'),
    )

    for column_name, regex, predicate, expected_status in scenarios:
        observed = self.hasPattern(column=column_name,
                                   pattern=regex,
                                   assertion=predicate)
        self.assertEqual(observed, [Row(constraint_status=expected_status)])
430+
431+
def test_fail_hasPattern(self):
    """Verify that ``hasPattern`` reports 'Failure' when its assertion does not hold.

    Every case pairs a column and pattern with an assertion that is wrong
    for the fixture rows, so the expected constraint status is 'Failure'.
    This assumes the value handed to the assertion is the fraction of rows
    matching the pattern, as the expectations in ``test_hasPattern`` imply:

    * ``email`` / ``.*@meow.com``: ``test_hasPattern`` succeeds with
      ``x == 1 / 3``, so ``x == 2 / 3`` cannot hold.
    * ``creditCard``: one fixture value has only 15 digits and no
      parentheses, so ``x == 1.0`` cannot hold.
    * ``email`` / full e-mail regex: ``test_hasPattern`` succeeds with
      ``x == 1.0``, so ``x == 0.0`` cannot hold.

    NOTE: this test is intentionally not marked ``expectedFailure``. With
    that decorator the test stops at the first failing ``assertEqual``, so
    only the first case would ever be evaluated. Asserting the 'Failure'
    status directly, inside ``subTest``, checks every case and reports each
    one independently.
    """
    email_regex = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

    # (label, column, pattern, assertion that is false for the fixture)
    failing_cases = (
        ('meow domain share', 'email', r".*@meow.com", lambda x: x == 2 / 3),
        ('all credit cards match', 'creditCard', r"\(|\)|\d{16}", lambda x: x == 1.0),
        ('no valid e-mails', 'email', email_regex, lambda x: x == 0.0),
    )

    for label, column, pattern, wrong_assertion in failing_cases:
        with self.subTest(case=label):
            self.assertEqual(self.hasPattern(column=column,
                                             pattern=pattern,
                                             assertion=wrong_assertion),
                             [Row(constraint_status='Failure')])
448+
408449 def test_containsCreditCardNumber (self ):
409450 self .assertEqual (self .containsCreditCardNumber ("creditCard" ), [Row (constraint_status = 'Success' )])
410451 self .assertEqual (self .containsCreditCardNumber ("creditCard" , lambda x : x == 1.0 , "All rows contain a credit card number" ),
@@ -763,7 +804,7 @@ def test_fail_hasMinLength(self):
763804 [Row (constraint_status = 'Success' )])
764805
765806 def test_hasMaxLength (self ):
766- self .assertEqual (self .hasMaxLength ("email" , lambda x : x == 24 , "Column email has 24 characters max" ),
807+ self .assertEqual (self .hasMaxLength ("email" , lambda x : x == 21 , "Column email has 24 characters max" ),
767808 [Row (constraint_status = 'Success' )])
768809 self .assertEqual (self .hasMaxLength ('email' , lambda x : x == 25 , "does not meet criteria" ),
769810 [Row (constraint_status = 'Failure' )])
0 commit comments