@@ -22,7 +22,7 @@ def setUpClass(cls):
2222 cls .df = cls .sc .parallelize ([
2323 Row (
a = "foo" ,
b = 1 ,
c = 5 ,
d = 5 ,
e = 3 ,
f = 1 ,
g = 'a' ,
h = 0 ,
creditCard = "5130566665286573" ,
email = "[email protected] " ,
ssn = "123-45-6789" ,
URL = "http://[email protected] :8080" ,
boolean = "true" ),
2424 Row (
a = "bar" ,
b = 2 ,
c = 6 ,
d = 5 ,
e = 2 ,
f = 2 ,
g = 'b' ,
h = - 1 ,
creditCard = "4532677117740914" ,
email = "[email protected] " ,
ssn = "123456789" ,
URL = "http://foo.com/(something)?after=parens" ,
boolean = "false" ),
25- Row (
a = "baz" ,
b = 3 ,
c = None ,
d = 5 ,
e = 1 ,
f = 1 ,
g = None ,
h = 2 ,
creditCard = "340145324521741 " ,
email = "yourusername@example .com" ,
ssn = "000-00-0000" ,
URL = "http://[email protected] :8080" ,
boolean = "true" )]).
toDF ()
25+ Row (
a = "baz" ,
b = 3 ,
c = None ,
d = 5 ,
e = 1 ,
f = 1 ,
g = None ,
h = 2 ,
creditCard = "3401453245217421 " ,
email = "yourusername@meow .com" ,
ssn = "000-00-0000" ,
URL = "http://[email protected] :8080" ,
boolean = "true" )]).
toDF ()
2626
2727 @classmethod
2828 def tearDownClass (cls ):
@@ -52,6 +52,16 @@ def hasSize(self, assertion, hint = None):
5252 df = VerificationResult .checkResultsAsDataFrame (self .spark , result )
5353 return df .select ('constraint_status' ).collect ()
5454
def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
    """Run one ``Check.hasPattern`` constraint against the fixture DataFrame.

    A Warning-level check named "test hasPattern" is built, attached to a
    verification suite over ``self.df`` and executed.

    :param column: name of the column handed to ``Check.hasPattern``
    :param pattern: regular expression handed to ``Check.hasPattern``
    :param assertion: optional callable forwarded unchanged to the check
    :param name: optional constraint name forwarded unchanged to the check
    :param hint: optional hint forwarded unchanged to the check
    :return: collected rows of the ``constraint_status`` column taken from
        the check-results DataFrame
    """
    pattern_check = Check(self.spark, CheckLevel.Warning, "test hasPattern")

    suite = VerificationSuite(self.spark).onData(self.df)
    constraint = pattern_check.hasPattern(column, pattern, assertion, name, hint)
    verification = suite.addCheck(constraint).run()

    results = VerificationResult.checkResultsAsDataFrame(self.spark, verification)
    status_only = results.select('constraint_status')
    return status_only.collect()
64+
5565 def containsCreditCardNumber (self , column , assertion = None , hint = None ):
5666 check = Check (self .spark , CheckLevel .Warning , "test containsCreditCardNumber" )
5767 result = VerificationSuite (self .spark ).onData (self .df ) \
@@ -402,6 +412,40 @@ def test_fail_hasSize(self):
402412 self .assertEqual (self .hasSize (lambda x : (x > 2.0 ), "size of dataframe should be 3" ),
403413 [Row (constraint_status = 'Failure' )])
404414
def test_hasPattern(self):
    """Check the constraint status produced by ``hasPattern`` for three patterns.

    Each case runs the ``hasPattern`` helper on one fixture column and
    compares the returned ``constraint_status`` rows with the expected
    outcome:

    * ``email`` against a literal ``@meow.com`` domain, expecting one third
      of the rows to match;
    * ``creditCard`` against parentheses or a 16-digit run, with an
      assertion of ``0.0`` that is expected to be reported as a Failure;
    * ``email`` against a general e-mail regex, expecting every row to match.
    """
    # The dot in the domain is escaped so it only matches a literal ".";
    # unescaped it would accept any character in that position.
    # The ratio is compared with a tolerance rather than exact float
    # equality, since 1/3 has no exact binary representation.
    self.assertEqual(self.hasPattern(column='email',
                                     pattern=r".*@meow\.com",
                                     assertion=lambda x: abs(x - 1 / 3) < 1e-9),
                     [Row(constraint_status='Success')])

    self.assertEqual(self.hasPattern(column='creditCard',
                                     pattern=r"\(|\)|\d{16}",
                                     assertion=lambda x: x == 0.0),
                     [Row(constraint_status='Failure')])

    self.assertEqual(self.hasPattern(column='email',
                                     pattern=r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
                                     assertion=lambda x: x == 1.0),
                     [Row(constraint_status='Success')])
430+
431+
@unittest.expectedFailure
def test_fail_hasPattern(self):
    """Expected failure: a wrong match ratio for the ``@meow.com`` domain.

    The assertion demands a ratio of 2/3 while the test claims Success, so
    the ``assertEqual`` below is meant to fail.

    Each expected-failure case lives in its own test method. When several
    assertions share one ``expectedFailure`` test, the first failing
    assertion ends the test and the later ones are never executed.
    """
    self.assertEqual(self.hasPattern(column='email',
                                     # Escaped dot: match the literal domain only.
                                     pattern=r".*@meow\.com",
                                     assertion=lambda x: x == 2 / 3),
                     [Row(constraint_status='Success')])

@unittest.expectedFailure
def test_fail_hasPattern_creditCard(self):
    """Expected failure: claims Failure for a ratio of 1.0 on ``creditCard``.

    NOTE(review): relies on every fixture credit-card value containing a
    16-digit run - confirm against the fixture in ``setUpClass``.
    """
    self.assertEqual(self.hasPattern(column='creditCard',
                                     pattern=r"\(|\)|\d{16}",
                                     assertion=lambda x: x == 1.0),
                     [Row(constraint_status='Failure')])

@unittest.expectedFailure
def test_fail_hasPattern_email_regex(self):
    """Expected failure: claims Success for a ratio of 0.0 on the e-mail regex.

    NOTE(review): relies on the fixture e-mail values matching the regex,
    as asserted by ``test_hasPattern`` - confirm against the fixture.
    """
    self.assertEqual(self.hasPattern(column='email',
                                     pattern=r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
                                     assertion=lambda x: x == 0.0),
                     [Row(constraint_status='Success')])
448+
405449 def test_containsCreditCardNumber (self ):
406450 self .assertEqual (self .containsCreditCardNumber ("creditCard" ), [Row (constraint_status = 'Success' )])
407451 self .assertEqual (self .containsCreditCardNumber ("creditCard" , lambda x : x == 1.0 , "All rows contain a credit card number" ),
0 commit comments