11package com .target .data_validator .validator
22
3- import com .target .data_validator .{ValidatorCheckEvent , ValidatorCounter , ValidatorError , VarSubstitution }
3+ import com .target .data_validator .{ColumnBasedValidatorCheckEvent , ValidatorCounter , ValidatorError , VarSubstitution }
44import com .target .data_validator .JsonEncoders .eventEncoder
55import io .circe .Json
66import io .circe .syntax ._
@@ -10,11 +10,29 @@ import org.apache.spark.sql.catalyst.expressions.Expression
1010import org .apache .spark .sql .catalyst .expressions .aggregate .Max
1111import org .apache .spark .sql .types ._
1212
13+ import scala .collection .immutable .ListMap
14+ import scala .math .abs
15+
/**
  * Base class for checks whose result is derived from a single expression (`condTest`).
  *
  * @param column   name of the column the check applies to (not read by this base class itself)
  * @param condTest expression handed back unchanged by [[select]]
  */
abstract class ColumnBased(column: String, condTest: Expression) extends CheapCheck {

  /** Returns `condTest` as-is; neither `schema` nor `dict` is consulted here. */
  override def select(schema: StructType, dict: VarSubstitution): Expression = condTest

  // ColumnBased checks don't have per row error details.
  def hasQuickErrorDetails: Boolean = false

  /**
    * Calculates the relative error between `expected` and `actual` and renders it as a string.
    *
    * @param expected  reference value; also the denominator of the relative error
    * @param actual    observed value
    * @param formatStr format pattern applied to the percentage (a single floating point argument)
    * @return the formatted percentage, or the literal "undefined" when `expected` is 0 and
    *         `actual` differs from it (the relative error would require dividing by zero)
    */
  def calculatePctError(expected: Double, actual: Double, formatStr: String = "%4.2f%%"): String = {
    // Locale.ROOT pins '.' as the decimal separator; the plain `format` would follow the JVM's
    // default locale and could emit e.g. "12,50%" on a comma-decimal system.
    def render(pct: Double): String = formatStr.formatLocal(java.util.Locale.ROOT, pct)

    if (expected == actual) {
      render(0.00) // if expected == actual, error % should be 0, even if expected is 0
    } else if (expected == 0.0) {
      "undefined"
    } else {
      render(abs(((expected - actual) * 100.0) / expected))
    }
  }
}
1937
2038case class MinNumRows (minNumRows : Long ) extends ColumnBased (" " , ValidatorBase .L0 ) {
@@ -36,8 +54,11 @@ case class MinNumRows(minNumRows: Long) extends ColumnBased("", ValidatorBase.L0
3654
/**
  * Fails the check when `count` is below `minNumRows`.
  *
  * Records a `rowCount` counter event plus one check event carrying the expected count, the
  * actual count and the relative error between them, then returns the failure flag.
  */
override def quickCheck(row: Row, count: Long, idx: Int): Boolean = {
  failed = count < minNumRows

  // The relative error is only computed for a failing check; a passing one reports zero.
  val relativeError =
    if (!failed) "0.00%"
    else calculatePctError(minNumRows, count)

  addEvent(ValidatorCounter("rowCount", count))

  val details = ListMap(
    "expected" -> minNumRows.toString,
    "actual" -> count.toString,
    "relative_error" -> relativeError
  )
  val summary = s"MinNumRowsCheck Expected: $minNumRows Actual: $count Relative Error: $relativeError"
  addEvent(ColumnBasedValidatorCheckEvent(failed, details, summary))

  failed
}
4364
@@ -66,34 +87,58 @@ case class ColumnMaxCheck(column: String, value: Json)
6687 val dataType = row.schema(idx).dataType
6788 val rMax = row(idx)
6889 logger.info(s " rMax: $rMax colType: $dataType value: $value valueClass: ${value.getClass.getCanonicalName}" )
69- val num = value.asNumber
70- failed = dataType match {
71- case StringType => value.asString.exists(_ != row.getString(idx))
72- case ByteType => num.map(_.toByte).exists(_.get != row.getByte(idx))
73- case ShortType => num.map(_.toShort).exists(_.get != row.getShort(idx))
74- case IntegerType =>
75- val intNum = value.asNumber.map(_.toInt.get).getOrElse(- 1 )
76- val rowInt = row.getInt(idx)
77- logger.debug(s " intNum[ ${intNum.getClass.getCanonicalName}]: $intNum " +
78- s " rowInt[ ${rowInt.getClass.getCanonicalName}]: $rowInt" )
79- num.map(_.toInt).exists(_.get != row.getInt(idx))
80- case LongType => num.map(_.toLong).exists(_.get != row.getLong(idx))
81- case FloatType => num.forall(_.toDouble != row.getFloat(idx))
82- case DoubleType => num.forall(_.toDouble != row.getDouble(idx))
83- case ut =>
84- logger.error(s " quickCheck for type: $ut, Row: $row not Implemented! Please file this as a bug. " )
85- true // Fail check!
90+
// Compares the configured value (as a string, "" when it is not a JSON string) with the
// string read from the row at `idx`; updates `failed` and returns (event data, message).
def resultForString: (ListMap[String, String], String) = {
  val wanted = value.asString.getOrElse("")
  val found = row.getString(idx)

  failed = wanted != found

  (
    ListMap("expected" -> wanted, "actual" -> found),
    s"ColumnMaxCheck $column[StringType]: Expected: $wanted Actual: $found"
  )
}
100+
// Compares the configured numeric value with the value read from the row at `idx`.
// Updates `failed` and returns (event data, message).
//
// - Integral column types are compared exactly in their own type; only the relative error is
//   computed in Double. An expected value that cannot be represented in the column's type
//   (out of range) never matches.
// - A configured value that is not a JSON number, or a numeric column type without a
//   comparison below, fails the check with an explanatory message instead of throwing.
def resultForNumeric: (ListMap[String, String], String) =
  value.asNumber match {
    case None =>
      failed = true
      val errorMsg = s"ColumnMaxCheck $column[$dataType]: Expected value $value is not a number"
      (ListMap("expected" -> value.toString, "actual" -> rMax.toString), errorMsg)

    case Some(num) =>
      // Some((exact match?, actual as Double)) for supported types, None otherwise.
      val comparison: Option[(Boolean, Double)] = dataType match {
        case ByteType =>
          val actual = row.getByte(idx)
          Some((num.toByte.contains(actual), actual.toDouble))
        case ShortType =>
          val actual = row.getShort(idx)
          Some((num.toShort.contains(actual), actual.toDouble))
        case IntegerType =>
          val actual = row.getInt(idx)
          Some((num.toInt.contains(actual), actual.toDouble))
        case LongType =>
          val actual = row.getLong(idx)
          Some((num.toLong.contains(actual), actual.toDouble))
        case FloatType =>
          val actual = row.getFloat(idx).toDouble
          Some((num.toDouble == actual, actual))
        case DoubleType =>
          val actual = row.getDouble(idx)
          Some((num.toDouble == actual, actual))
        case _ =>
          None // a numeric type with no comparison implemented here
      }

      comparison match {
        case Some((matches, actual)) =>
          failed = !matches
          val pctError = if (failed) calculatePctError(num.toDouble, actual) else "0.00%"
          val data = ListMap("expected" -> num.toString, "actual" -> rMax.toString, "relative_error" -> pctError)
          val errorMsg = s"ColumnMaxCheck $column[$dataType]: Expected: $num Actual: $rMax Relative Error: $pctError"
          (data, errorMsg)

        case None =>
          failed = true
          (ListMap.empty[String, String], s"ColumnMaxCheck is not supported for data type $dataType")
      }
  }
121+
// Fallback for column types without a comparison: logs an error, marks the check as failed
// and returns empty event data together with an "unsupported type" message.
def resultForOther: (ListMap[String, String], String) = {
  logger.error(
    s"""ColumnMaxCheck for type: $dataType, Row: $row not implemented!
       |Please open a bug report on the data-validator issue tracker.""".stripMargin
  )
  failed = true

  val noData = ListMap.empty[String, String]
  (noData, s"ColumnMaxCheck is not supported for data type $dataType")
}
132+
133+ val (data, errorMsg) = dataType match {
134+ case StringType => resultForString
135+ case _ : NumericType => resultForNumeric
136+ case _ => resultForOther
86137 }
138+
87139 logger.debug(s " MaxValue compared Row: $row with value: $value failed: $failed" )
88140 if (failed) {
89- addEvent(
90- ValidatorCheckEvent (
91- failed,
92- s " columnMaxCheck column[ $dataType]: $column value: $value doesn't equal $rMax" ,
93- count,
94- 1
95- )
96- )
141+ addEvent(ColumnBasedValidatorCheckEvent (failed, data, errorMsg))
97142 }
98143 failed
99144 }
0 commit comments