diff --git a/src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala b/src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala
index 302fee94d..c53566a19 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/PatternMatch.scala
@@ -35,7 +35,11 @@ import scala.util.matching.Regex
   * @param pattern The regular expression to check for
   * @param where Additional filter to apply before the analyzer is run.
   */
-case class PatternMatch(column: String, pattern: Regex, where: Option[String] = None)
+case class PatternMatch(
+    column: String,
+    pattern: Regex,
+    where: Option[String] = None,
+    isNullAllowed: Boolean = false)
   extends StandardScanShareableAnalyzer[NumMatchesAndCount]("PatternMatch", column)
   with FilterableAnalyzer {
 
@@ -48,6 +52,7 @@ case class PatternMatch(column: String, pattern: Regex, where: Option[String] =
   override def aggregationFunctions(): Seq[Column] = {
 
     val expression = when(regexp_extract(col(column), pattern.toString(), 0) =!= lit(""), 1)
+      .when(lit(isNullAllowed) && col(column).isNull, 1)
       .otherwise(0)
 
     val summation = sum(conditionalSelection(expression, where).cast(IntegerType))
diff --git a/src/main/scala/com/amazon/deequ/checks/Check.scala b/src/main/scala/com/amazon/deequ/checks/Check.scala
index 93a7b4076..4112841de 100644
--- a/src/main/scala/com/amazon/deequ/checks/Check.scala
+++ b/src/main/scala/com/amazon/deequ/checks/Check.scala
@@ -691,11 +691,19 @@ case class Check(
       pattern: Regex,
       assertion: Double => Boolean = Check.IsOne,
       name: Option[String] = None,
-      hint: Option[String] = None)
+      hint: Option[String] = None,
+      isNullAllowed: Boolean = false)
     : CheckWithLastConstraintFilterable = {
 
     addFilterableConstraint { filter =>
-      Constraint.patternMatchConstraint(column, pattern, assertion, filter, name, hint)
+      Constraint.patternMatchConstraint(
+        column,
+        pattern,
+        assertion,
+        filter,
+        name,
+        hint,
+        isNullAllowed)
     }
   }
 
diff --git a/src/main/scala/com/amazon/deequ/constraints/Constraint.scala b/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
index c7963ce41..00de8dfe3 100644
--- a/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
+++ b/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
@@ -303,10 +303,11 @@ object Constraint {
       assertion: Double => Boolean,
       where: Option[String] = None,
       name: Option[String] = None,
-      hint: Option[String] = None)
+      hint: Option[String] = None,
+      isNullAllowed: Boolean = false)
     : Constraint = {
 
-    val patternMatch = PatternMatch(column, pattern, where)
+    val patternMatch = PatternMatch(column, pattern, where, isNullAllowed)
 
     val constraint = AnalysisBasedConstraint[NumMatchesAndCount, Double, Double](
       patternMatch, assertion, hint = hint)
diff --git a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
index d512833dc..4d691c2c3 100644
--- a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
+++ b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
@@ -616,6 +616,27 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix
       assertSuccess(baseCheck.hasMaxLength("att1", _ == 4.0), context)
     }
 
+    "ignore null values for hasPattern constraints" in withSparkSession { spark =>
+      import spark.implicits._
+
+      val df = Seq(
+        ("123", 1),
+        (null, 2),
+        ("456", 1)
+      ).toDF("nullable", "id")
+
+      val check = Check(CheckLevel.Error, "some description")
+        .hasPattern("nullable", "\\d{3,3}".r, _ == 1.0)
+      val checkWithNullAllowed = Check(CheckLevel.Error, "some description")
+        .hasPattern("nullable", "\\d{3,3}".r, _ == 1.0, isNullAllowed = true)
+
+      val context = runChecks(df, check)
+      val nullAllowedContext = runChecks(df, checkWithNullAllowed)
+
+      assertEvaluatesTo(check, context, CheckStatus.Error)
+      assertEvaluatesTo(checkWithNullAllowed, nullAllowedContext, CheckStatus.Success)
+    }
+
     "work on regular expression patterns for E-Mails" in withSparkSession { sparkSession =>
       val col = "some"
       val df = dataFrameWithColumn(col, StringType, sparkSession, Row("someone@somewhere.org"),