 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.catalyst.expressions.Expression;
 import org.apache.spark.sql.catalyst.expressions.Literal$;
 import org.apache.spark.sql.delta.DeltaLog;
-import org.apache.spark.sql.delta.actions.AddFile;
 import org.apache.spark.sql.delta.stats.StatisticsCollection;
 import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.io.TempDir;
-import scala.Function1;
 import scala.Option;
 import scala.collection.JavaConverters;
-import scala.runtime.AbstractFunction1;

 /** Tests for V2 streaming read operations. */
 public class V2StreamingReadTest extends V2TestBase {
@@ -121,44 +120,41 @@ public void testStreamingReadAfterStatsRecompute(@TempDir File deltaTablePath) t |
     String tablePath = deltaTablePath.getAbsolutePath();

     // Write data with stats collection disabled - files will have no stats
-    spark.conf().set("spark.databricks.delta.stats.collect", "false");
-    try {
-      spark
-          .range(10)
-          .selectExpr("id", "cast(id as string) as value")
-          .write()
-          .format("delta")
-          .save(tablePath);
-    } finally {
-      spark.conf().set("spark.databricks.delta.stats.collect", "true");
-    }
+    withSQLConf(
+        "spark.databricks.delta.stats.collect",
+        "false",
+        () ->
+            spark
+                .range(10)
+                .selectExpr("id", "cast(id as string) as value")
+                .write()
+                .format("delta")
+                .save(tablePath));

     // Recompute statistics - this re-adds files with updated stats (dataChange=false),
     // creating duplicate AddFile entries in the log that must be filtered by selection vector
     DeltaLog deltaLog = DeltaLog.forTable(spark, tablePath);
-    scala.collection.immutable.Seq<Expression> predicates =
+    StatisticsCollection.recompute(
+        spark,
+        deltaLog,
+        Option.empty(),
         JavaConverters.<Expression>asScalaBuffer(
                 new ArrayList<>(List.of((Expression) Literal$.MODULE$.apply(true))))
-            .toList();
-    Function1<AddFile, Object> fileFilter =
-        new AbstractFunction1<AddFile, Object>() {
-          @Override
-          public Object apply(AddFile af) {
-            return (Object) Boolean.TRUE;
-          }
-        };
-    StatisticsCollection.recompute(spark, deltaLog, Option.empty(), predicates, fileFilter);
+            .toList(),
+        af -> (Object) Boolean.TRUE);

     // Stream via V2 - should see each row exactly once, not duplicated
     String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
     Dataset<Row> streamingDF = spark.readStream().table(dsv2TableRef);

     List<Row> actualRows = processStreamingQuery(streamingDF, "test_stats_recompute");

-    assertEquals(
-        10,
-        actualRows.size(),
-        "Stats recompute should not cause duplicate rows in streaming read. Got: " + actualRows);
+    List<Row> expectedRows =
+        LongStream.range(0, 10)
+            .mapToObj(i -> RowFactory.create(i, String.valueOf(i)))
+            .collect(Collectors.toList());
+
+    assertDataEquals(actualRows, expectedRows);
   }

   /**
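
For context on the `withSQLConf` call this diff introduces: the helper itself is not shown here (it presumably lives in `V2TestBase`, and its exact signature is an assumption), but the usual shape of such a helper is to set the SQL conf, run the body, and restore the prior state in a `finally` block. A minimal sketch, assuming a `Runnable` body and Spark's standard `RuntimeConfig` API:

```java
// Hypothetical sketch only; the real helper in V2TestBase may differ in
// name, signature, and exception handling.
protected void withSQLConf(String key, String value, Runnable body) {
  // Capture whatever was set before, so the original state can be restored exactly.
  scala.Option<String> previous = spark.conf().getOption(key);
  spark.conf().set(key, value);
  try {
    body.run();
  } finally {
    if (previous.isDefined()) {
      spark.conf().set(key, previous.get());
    } else {
      // The key was unset before; unset it again rather than guessing a default.
      spark.conf().unset(key);
    }
  }
}
```

A helper of this shape is also a small correctness improvement over the removed inline `try`/`finally`, which unconditionally restored the conf to `"true"` regardless of its value before the test ran.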
|