 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.catalyst.expressions.Expression;
 import org.apache.spark.sql.catalyst.expressions.Literal$;
 import org.apache.spark.sql.delta.DeltaLog;
-import org.apache.spark.sql.delta.actions.AddFile;
 import org.apache.spark.sql.delta.stats.StatisticsCollection;
 import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.io.TempDir;
-import scala.Function1;
 import scala.Option;
 import scala.collection.JavaConverters;
-import scala.runtime.AbstractFunction1;

 /** Tests for V2 streaming read operations. */
 public class V2StreamingReadTest extends V2TestBase {
@@ -121,44 +120,41 @@ public void testStreamingReadAfterStatsRecompute(@TempDir File deltaTablePath) t |
     String tablePath = deltaTablePath.getAbsolutePath();

     // Write data with stats collection disabled - files will have no stats
-    spark.conf().set("spark.databricks.delta.stats.collect", "false");
-    try {
-      spark
-          .range(10)
-          .selectExpr("id", "cast(id as string) as value")
-          .write()
-          .format("delta")
-          .save(tablePath);
-    } finally {
-      spark.conf().set("spark.databricks.delta.stats.collect", "true");
-    }
+    withSQLConf(
+        "spark.databricks.delta.stats.collect",
+        "false",
+        () ->
+            spark
+                .range(10)
+                .selectExpr("id", "cast(id as string) as value")
+                .write()
+                .format("delta")
+                .save(tablePath));

     // Recompute statistics - this re-adds files with updated stats (dataChange=false),
     // creating duplicate AddFile entries in the log that must be filtered by selection vector
     DeltaLog deltaLog = DeltaLog.forTable(spark, tablePath);
-    scala.collection.immutable.Seq<Expression> predicates =
+    StatisticsCollection.recompute(
+        spark,
+        deltaLog,
+        Option.empty(),
         JavaConverters.<Expression>asScalaBuffer(
                 new ArrayList<>(List.of((Expression) Literal$.MODULE$.apply(true))))
-            .toList();
-    Function1<AddFile, Object> fileFilter =
-        new AbstractFunction1<AddFile, Object>() {
-          @Override
-          public Object apply(AddFile af) {
-            return (Object) Boolean.TRUE;
-          }
-        };
-    StatisticsCollection.recompute(spark, deltaLog, Option.empty(), predicates, fileFilter);
+            .toList(),
+        af -> (Object) Boolean.TRUE);

     // Stream via V2 - should see each row exactly once, not duplicated
     String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
     Dataset<Row> streamingDF = spark.readStream().table(dsv2TableRef);

     List<Row> actualRows = processStreamingQuery(streamingDF, "test_stats_recompute");

-    assertEquals(
-        10,
-        actualRows.size(),
-        "Stats recompute should not cause duplicate rows in streaming read. Got: " + actualRows);
+    List<Row> expectedRows =
+        LongStream.range(0, 10)
+            .mapToObj(i -> RowFactory.create(i, String.valueOf(i)))
+            .collect(Collectors.toList());
+
+    assertDataEquals(actualRows, expectedRows);
   }

   /**
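
For context on the `withSQLConf` call this diff introduces: the helper itself is not shown here (it presumably lives in `V2TestBase`, and its exact signature is an assumption), but the usual shape of such a helper is to set the SQL conf, run the body, and restore the prior state in a `finally` block. A minimal sketch, assuming a `Runnable` body and Spark's standard `RuntimeConfig` API:

```java
// Hypothetical sketch only; the real helper in V2TestBase may differ in
// name, signature, and exception handling.
protected void withSQLConf(String key, String value, Runnable body) {
  // Capture whatever was set before, so the original state can be restored exactly.
  scala.Option<String> previous = spark.conf().getOption(key);
  spark.conf().set(key, value);
  try {
    body.run();
  } finally {
    if (previous.isDefined()) {
      spark.conf().set(key, previous.get());
    } else {
      // The key was unset before; unset it again rather than guessing a default.
      spark.conf().unset(key);
    }
  }
}
```

A helper of this shape is also a small correctness improvement over the removed inline `try`/`finally`, which unconditionally restored the conf to `"true"` regardless of its value before the test ran.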
|