Skip to content

Commit 44388e5

Browse files
authored
Support new SeparateLines Dataframe Diff output (#209)
* Changes: - add truncate option to dataframe diff output - Remove cell width as it's a duplicate of row width - as Ufansi colors don't change string length - extract cellToString method as it has better implementation when converting dataframe cell to string
1 parent caa0a2e commit 44388e5

File tree

7 files changed

+690
-67
lines changed

7 files changed

+690
-67
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ assertSmallDataFrameEquality(sourceDF, expectedDF)
6868
<p>
6969
<img src="./images/assertSmallDataFrameEquality_DatasetContentMissmatch_message.png" alt="assertSmallDataFrameEquality_DatasetContentMissmatch_message" width="500", height="200"/>
7070
</p>
71+
Or, if you prefer a stacked (separate-lines) layout for wide DataFrames:
72+
```scala
73+
assertSmallDataFrameEquality(..., outputFormat = DataframeDiffOutputFormat.SeparateLines)
74+
```
75+
<p>
76+
<img src="./images/assertSmallWideDataframe.png" alt="assertSmallWideDataframe"/>
77+
</p>
7178

7279
The `assertSmallDatasetEquality` method can be used to compare two Datasets or DataFrames (a DataFrame is a `Dataset[Row]`).
7380
Nicely formatted error messages are displayed when the Datasets are not equal. Here is an example of content mismatch:

core/src/main/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparer.scala

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
11
package com.github.mrpowers.spark.fast.tests
22

3+
import com.github.mrpowers.spark.fast.tests.SeqLikesExtensions.SeqExtensions
34
import org.apache.spark.sql.{DataFrame, Row}
5+
import com.github.mrpowers.spark.fast.tests.DataframeDiffOutputFormat.DataframeDiffOutputFormat
6+
47
trait DataFrameComparer extends DatasetComparer {
58

69
/**
710
* Raises an error unless `actualDF` and `expectedDF` are equal
11+
* @param actualDF
12+
* \- actual dataframe
13+
* @param expectedDF
14+
* \- expected dataframe
15+
* @param ignoreNullable
16+
* \- ignore nullable parameter when matching schemas
17+
* @param ignoreColumnNames
18+
* \- ignore column names
19+
* @param orderedComparison
20+
* \- if false sorts actual and expected
21+
* @param ignoreMetadata
22+
* \- don't compare column metadata when matching schemas
23+
* @param truncate
24+
* \- truncate column if length more than specified number
825
*/
926
def assertSmallDataFrameEquality(
1027
actualDF: DataFrame,
@@ -14,18 +31,54 @@ trait DataFrameComparer extends DatasetComparer {
1431
orderedComparison: Boolean = true,
1532
ignoreColumnOrder: Boolean = false,
1633
ignoreMetadata: Boolean = true,
17-
truncate: Int = 500
34+
truncate: Int = 500,
35+
outputFormat: DataframeDiffOutputFormat = DataframeDiffOutputFormat.SideBySide
1836
): Unit = {
19-
assertSmallDatasetEquality(
20-
actualDF,
21-
expectedDF,
22-
ignoreNullable,
23-
ignoreColumnNames,
24-
orderedComparison,
25-
ignoreColumnOrder,
26-
ignoreMetadata,
27-
truncate
28-
)
37+
outputFormat match {
38+
case DataframeDiffOutputFormat.SideBySide =>
39+
assertSmallDatasetEquality(
40+
actualDF,
41+
expectedDF,
42+
ignoreNullable,
43+
ignoreColumnNames,
44+
orderedComparison,
45+
ignoreColumnOrder,
46+
ignoreMetadata,
47+
truncate
48+
)
49+
case DataframeDiffOutputFormat.SeparateLines =>
50+
SchemaComparer.assertSchemaEqual(
51+
actualDF.schema,
52+
expectedDF.schema,
53+
ignoreNullable,
54+
ignoreColumnNames,
55+
ignoreColumnOrder,
56+
ignoreMetadata
57+
)
58+
val actual = if (ignoreColumnOrder) orderColumns(actualDF, expectedDF) else actualDF
59+
if (orderedComparison)
60+
assertSmallDataFrameEquality(actual, expectedDF, truncate)
61+
else
62+
assertSmallDataFrameEquality(
63+
defaultSortDataset(actual),
64+
defaultSortDataset(expectedDF),
65+
truncate
66+
)
67+
}
68+
69+
}
70+
71+
private def assertSmallDataFrameEquality(
72+
actualDF: DataFrame,
73+
expectedDF: DataFrame,
74+
truncate: Int
75+
): Unit = {
76+
val a = actualDF.collect()
77+
val e = expectedDF.collect()
78+
if (!a.toSeq.approximateSameElements(e, (o1: Row, o2: Row) => o1.equals(o2))) {
79+
val msg = "Difference\n" ++ DataframeUtil.showDataframeDiff(a, e, actualDF.schema.fieldNames, truncate)
80+
throw DatasetContentMismatch(msg)
81+
}
2982
}
3083

3184
/**

core/src/main/scala/com/github/mrpowers/spark/fast/tests/DataFramePrettyPrint.scala

Lines changed: 62 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -18,60 +18,7 @@ object DataFramePrettyPrint {
1818
// For cells that are beyond `truncate` characters, replace it with the
1919
// first `truncate-3` and "..."
2020
val rows: Seq[Seq[String]] = df.schema.fieldNames.toSeq +: data.map { row =>
21-
row.toSeq.map { cell =>
22-
val str = cell match {
23-
case null => "null"
24-
case binary: Array[Byte] =>
25-
binary
26-
.map("%02X".format(_))
27-
.mkString(
28-
"[",
29-
" ",
30-
"]"
31-
)
32-
case array: Array[_] =>
33-
array.mkString(
34-
"[",
35-
", ",
36-
"]"
37-
)
38-
case seq: Seq[_] =>
39-
seq.mkString(
40-
"[",
41-
", ",
42-
"]"
43-
)
44-
case d: Date =>
45-
d.toLocalDate.format(DateTimeFormatter.ISO_DATE)
46-
case r: Row =>
47-
r.schema.fieldNames
48-
.zip(r.toSeq)
49-
.map { case (k, v) =>
50-
s"$k -> $v"
51-
}
52-
.mkString(
53-
"{",
54-
", ",
55-
"}"
56-
)
57-
case _ => cell.toString
58-
}
59-
if (truncate > 0 && str.length > truncate) {
60-
// do not show ellipses for strings shorter than 4 characters.
61-
if (truncate < 4)
62-
str.substring(
63-
0,
64-
truncate
65-
)
66-
else
67-
str.substring(
68-
0,
69-
truncate - 3
70-
) + "..."
71-
} else {
72-
str
73-
}
74-
}: Seq[String]
21+
row.toSeq.map(cellToString(_, truncate)): Seq[String]
7522
}
7623

7724
val sb = new StringBuilder
@@ -160,4 +107,65 @@ object DataFramePrettyPrint {
160107
sb.toString()
161108
}
162109

110+
/**
111+
* Convert dataframe cell to string
112+
* @param cell
113+
* \- cell value
114+
* @param truncate
115+
* \-
116+
*/
117+
private[mrpowers] def cellToString(cell: Any, truncate: Int): String = {
118+
val str = cell match {
119+
case null => "null"
120+
case binary: Array[Byte] =>
121+
binary
122+
.map("%02X".format(_))
123+
.mkString(
124+
"[",
125+
" ",
126+
"]"
127+
)
128+
case array: Array[_] =>
129+
array.mkString(
130+
"[",
131+
", ",
132+
"]"
133+
)
134+
case seq: Seq[_] =>
135+
seq.mkString(
136+
"[",
137+
", ",
138+
"]"
139+
)
140+
case d: Date =>
141+
d.toLocalDate.format(DateTimeFormatter.ISO_DATE)
142+
case r: Row =>
143+
r.schema.fieldNames
144+
.zip(r.toSeq)
145+
.map { case (k, v) =>
146+
s"$k -> $v"
147+
}
148+
.mkString(
149+
"{",
150+
", ",
151+
"}"
152+
)
153+
case _ => cell.toString
154+
}
155+
if (truncate > 0 && str.length > truncate) {
156+
// do not show ellipses for strings shorter than 4 characters.
157+
if (truncate < 4)
158+
str.substring(
159+
0,
160+
truncate
161+
)
162+
else
163+
str.substring(
164+
0,
165+
truncate - 3
166+
) + "..."
167+
} else {
168+
str
169+
}
170+
}
163171
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package com.github.mrpowers.spark.fast.tests
2+
3+
object DataframeDiffOutputFormat extends Enumeration {
4+
type DataframeDiffOutputFormat = Value
5+
val SideBySide, SeparateLines = Value
6+
}
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
package com.github.mrpowers.spark.fast.tests
2+
3+
import com.github.mrpowers.spark.fast.tests.ufansi.Color.{DarkGray, Green, Red}
4+
import com.github.mrpowers.spark.fast.tests.ufansi.FansiExtensions.StrOps
5+
import com.github.mrpowers.spark.fast.tests.ufansi.Str
6+
import org.apache.commons.lang3.StringUtils
7+
import org.apache.spark.sql.Row
8+
9+
object DataframeUtil {
10+
private val EmptyStr = Str("")
11+
private[mrpowers] def showDataframeDiff(
12+
actual: Array[Row],
13+
expected: Array[Row],
14+
fieldNames: Array[String],
15+
truncate: Int = 20,
16+
minColWidth: Int = 3
17+
): String = {
18+
val sb = new StringBuilder
19+
val colWidths = getColWidths(fieldNames, actual.toSeq ++ expected.toSeq, truncate, minColWidth)
20+
// needed to calculate padding for other elements
21+
val largestIndexOffset = {
22+
if (actual.isEmpty && expected.isEmpty) 0
23+
else {
24+
val maxIndex = Math.max(actual.length, expected.length) - 1
25+
val largestIndex = maxIndex.toString + ":" // Largest index that is visible on the side. Like `100:`
26+
largestIndex.length + 1
27+
}
28+
}
29+
30+
val fullJoinWithEquals = actual
31+
.zipAll(expected, Row.empty, Row.empty)
32+
.map { case (actualRow, expectedRow) => (actualRow, expectedRow, actualRow.equals(expectedRow)) }
33+
val diff = fullJoinWithEquals.map { case (actualRow, expectedRow, rowsAreEqual) =>
34+
lazy val paddedActualRow = pad(actualRow.toSeq, colWidths, truncate)
35+
lazy val paddedExpectedRow = pad(expectedRow.toSeq, colWidths, truncate)
36+
if (rowsAreEqual) {
37+
List(DarkGray(paddedActualRow.mkString("|")), DarkGray(paddedActualRow.mkString("|")))
38+
} else {
39+
val actualSeq = actualRow.toSeq
40+
val expectedSeq = expectedRow.toSeq
41+
if (actualSeq.isEmpty)
42+
List(
43+
EmptyStr,
44+
Green(paddedExpectedRow.mkString("", "|", ""))
45+
)
46+
else if (expectedSeq.isEmpty)
47+
List(Red(paddedActualRow.mkString("", "|", "")), EmptyStr)
48+
else {
49+
val withEquals = actualSeq
50+
.zip(expectedSeq)
51+
.map { case (actualRowField, expectedRowField) =>
52+
(actualRowField, expectedRowField, actualRowField == expectedRowField)
53+
}
54+
val allFieldsAreNotEqual = !withEquals.exists(_._3)
55+
if (allFieldsAreNotEqual) {
56+
List(
57+
Red(paddedActualRow.mkString("", "|", "")),
58+
Green(paddedExpectedRow.mkString("", "|", ""))
59+
)
60+
} else {
61+
val coloredDiff = withEquals.zipWithIndex
62+
.map {
63+
case ((actualRowField, expectedRowField, true), i) =>
64+
val paddedActualRow = padAny(actualRowField, colWidths(i), truncate)
65+
val paddedExpected = padAny(expectedRowField, colWidths(i), truncate)
66+
(DarkGray(paddedActualRow), DarkGray(paddedExpected))
67+
case ((actualRowField, expectedRowField, false), i) =>
68+
val paddedActualRow = padAny(actualRowField, colWidths(i), truncate)
69+
val paddedExpected = padAny(expectedRowField, colWidths(i), truncate)
70+
(Red(paddedActualRow), Green(paddedExpected))
71+
}
72+
val start = DarkGray("")
73+
val sep = DarkGray("|")
74+
val end = DarkGray("")
75+
List(
76+
coloredDiff.map(_._1).mkStr(start, sep, end),
77+
coloredDiff.map(_._2).mkStr(start, sep, end)
78+
)
79+
}
80+
}
81+
}
82+
}
83+
84+
val headerWithLeftPadding = pad(fieldNames, colWidths, truncate)
85+
val headerFields = List(headerWithLeftPadding.mkString("|"))
86+
87+
val separatorLine: String =
88+
colWidths
89+
.map("-" * _)
90+
.mkString(StringUtils.leftPad("+", largestIndexOffset), "+", "+\n")
91+
92+
sb.append(separatorLine)
93+
94+
headerFields
95+
.zip(colWidths)
96+
.map { case (cell, colWidth) =>
97+
StringUtils.leftPad(cell, colWidth)
98+
}
99+
.addString(sb, StringUtils.leftPad("|", largestIndexOffset), "|", "|\n")
100+
diff.zipWithIndex.foreach { case (actual :: expected :: Nil, i) =>
101+
def appendRow(row: Str, i: Int): Unit = {
102+
if (row.length > 0) {
103+
val indexString = StringUtils.leftPad(s"${i + 1}:|", largestIndexOffset)
104+
sb.append(indexString)
105+
sb.append(row)
106+
sb.append(s"|:${i + 1}\n")
107+
}
108+
}
109+
appendRow(actual, i)
110+
val rowsAreDifferent = !fullJoinWithEquals(i)._3
111+
if (rowsAreDifferent) {
112+
appendRow(expected, i)
113+
if (i < diff.length - 1)
114+
sb.append(separatorLine)
115+
} else if (i < diff.length - 1 && !fullJoinWithEquals(i + 1)._3) { // if current rows are equal and next ones are not
116+
sb.append(separatorLine)
117+
}
118+
}
119+
sb.append(separatorLine).toString()
120+
}
121+
122+
private def pad(items: Seq[Any], colWidths: Array[Int], truncateColumnLen: Int): Seq[String] =
123+
items.zip(colWidths).map { case (v, colWidth) => padAny(v, colWidth, truncateColumnLen) }
124+
125+
private def padAny(s: Any, width: Int, truncateColumnLen: Int) = {
126+
StringUtils.leftPad(DataFramePrettyPrint.cellToString(s, truncateColumnLen), width)
127+
}
128+
129+
private def getColWidths(fields: Array[String], rows: Seq[Row], truncate: Int, minColWidth: Int) = {
130+
val numCols = if (rows.isEmpty) 0 else rows.map(_.size).max
131+
// Initialise the width of each column to a minimum value
132+
val colWidths = Array.fill(numCols)(minColWidth)
133+
for ((cell, i) <- fields.zipWithIndex) {
134+
colWidths(i) = math.max(colWidths(i), DataFramePrettyPrint.cellToString(cell, truncate).length)
135+
}
136+
// Compute the width of each column
137+
for (row <- rows)
138+
for ((cell, i) <- row.toSeq.zipWithIndex)
139+
colWidths(i) = math.max(colWidths(i), DataFramePrettyPrint.cellToString(cell, truncate).length)
140+
colWidths
141+
}
142+
143+
}

0 commit comments

Comments
 (0)