17 | 17 |
18 | 18 | package org.apache.spark.sql.streaming.test |
19 | 19 |
20 | | -import org.scalatest.Tag |
| 20 | +import scala.concurrent.duration._ |
| 21 | + |
| 22 | +import org.apache.hadoop.fs.Path |
| 23 | +import org.mockito.ArgumentMatchers.{any, eq => meq} |
| 24 | +import org.mockito.Mockito._ |
| 25 | +import org.scalatest.{BeforeAndAfterEach, Tag} |
21 | 26 |
22 | 27 | import org.apache.spark.sql._ |
23 | 28 | import org.apache.spark.sql.internal.SQLConf |
24 | 29 | import org.apache.spark.sql.streaming.StreamTest |
| 30 | +import org.apache.spark.sql.streaming.Trigger._ |
| 31 | +import org.apache.spark.util.Utils |
25 | 32 |
26 | 33 | /** |
27 | 34 | * Test suite for streaming source naming and validation. |
28 | 35 | * Tests cover the naming API, validation rules, and resolution pipeline. |
29 | 36 | */ |
30 | | -class StreamingQueryEvolutionSuite extends StreamTest { |
| 37 | +class StreamingQueryEvolutionSuite extends StreamTest with BeforeAndAfterEach { |
| 38 | + import testImplicits._ |
| 39 | + |
| 40 | + private def newMetadataDir = |
| 41 | + Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath |
| 42 | + |
| 43 | + override def afterEach(): Unit = { |
| 44 | + spark.streams.active.foreach(_.stop()) |
| 45 | + super.afterEach() |
| 46 | + } |
| 47 | + |
| 48 | + /** |
| 49 | + * Helper to verify that a source was created with the expected metadata path. |
| 50 | + * @param checkpointLocation the checkpoint location path |
| 51 | + * @param sourcePath the expected source path (e.g., "source1" or "0") |
| 52 | + * @param mode mockito verification mode (default: times(1)) |
| 53 | + */ |
| 54 | + private def verifySourcePath( |
| 55 | + checkpointLocation: Path, |
| 56 | + sourcePath: String, |
| 57 | + mode: org.mockito.verification.VerificationMode = times(1)): Unit = { |
| 58 | + verify(LastOptions.mockStreamSourceProvider, mode).createSource( |
| 59 | + any(), |
| 60 | + meq(s"${new Path(makeQualifiedPath( |
| 61 | + checkpointLocation.toString)).toString}/sources/$sourcePath"), |
| 62 | + meq(None), |
| 63 | + meq("org.apache.spark.sql.streaming.test"), |
| 64 | + meq(Map.empty)) |
| 65 | + } |
31 | 66 |
32 | 67 | // ==================== |
33 | 68 | // Name Validation Tests |
@@ -159,16 +194,258 @@ class StreamingQueryEvolutionSuite extends StreamTest { |
159 | 194 | assert(union.isStreaming, "Union should be streaming") |
160 | 195 | } |
161 | 196 |
| 197 | + test("without enforcement - naming sources throws error") { |
| 198 | + withSQLConf(SQLConf.ENABLE_STREAMING_SOURCE_EVOLUTION.key -> "false") { |
| 199 | + checkError( |
| 200 | + exception = intercept[AnalysisException] { |
| 201 | + spark.readStream |
| 202 | + .format("org.apache.spark.sql.streaming.test") |
| 203 | + .name("mySource") |
| 204 | + .load() |
| 205 | + }, |
| 206 | + condition = "STREAMING_QUERY_EVOLUTION_ERROR.SOURCE_NAMING_NOT_SUPPORTED", |
| 207 | + parameters = Map("name" -> "mySource")) |
| 208 | + } |
| 209 | + } |
| 210 | + |
| 211 | + // ======================= |
| 212 | + // Metadata Path Tests |
| 213 | + // ======================= |
| 214 | + |
| 215 | + testWithSourceEvolution("named sources - metadata path uses source name") { |
| 216 | + LastOptions.clear() |
| 217 | + |
| 218 | + val checkpointLocation = new Path(newMetadataDir) |
| 219 | + |
| 220 | + val df1 = spark.readStream |
| 221 | + .format("org.apache.spark.sql.streaming.test") |
| 222 | + .name("source1") |
| 223 | + .load() |
| 224 | + |
| 225 | + val df2 = spark.readStream |
| 226 | + .format("org.apache.spark.sql.streaming.test") |
| 227 | + .name("source2") |
| 228 | + .load() |
| 229 | + |
| 230 | + val q = df1.union(df2).writeStream |
| 231 | + .format("org.apache.spark.sql.streaming.test") |
| 232 | + .option("checkpointLocation", checkpointLocation.toString) |
| 233 | + .trigger(ProcessingTime(10.seconds)) |
| 234 | + .start() |
| 235 | + q.processAllAvailable() |
| 236 | + q.stop() |
| 237 | + |
| 238 | + verifySourcePath(checkpointLocation, "source1") |
| 239 | + verifySourcePath(checkpointLocation, "source2") |
| 240 | + } |
| 241 | + |
| 242 | + test("unnamed sources use positional IDs for metadata path") { |
| 243 | + withSQLConf(SQLConf.ENABLE_STREAMING_SOURCE_EVOLUTION.key -> "false") { |
| 244 | + LastOptions.clear() |
| 245 | + |
| 246 | + val checkpointLocation = new Path(newMetadataDir) |
| 247 | + |
| 248 | + val df1 = spark.readStream |
| 249 | + .format("org.apache.spark.sql.streaming.test") |
| 250 | + .load() |
| 251 | + |
| 252 | + val df2 = spark.readStream |
| 253 | + .format("org.apache.spark.sql.streaming.test") |
| 254 | + .load() |
| 255 | + |
| 256 | + val q = df1.union(df2).writeStream |
| 257 | + .format("org.apache.spark.sql.streaming.test") |
| 258 | + .option("checkpointLocation", checkpointLocation.toString) |
| 259 | + .trigger(ProcessingTime(10.seconds)) |
| 260 | + .start() |
| 261 | + q.processAllAvailable() |
| 262 | + q.stop() |
| 263 | + |
| 264 | + // Without names, sources fall back to positional IDs (Unassigned -> 0, 1, ...)
| 265 | + verifySourcePath(checkpointLocation, "0") |
| 266 | + verifySourcePath(checkpointLocation, "1") |
| 267 | + } |
| 268 | + } |
| 269 | + |
| 270 | + // ======================== |
| 271 | + // Source Evolution Tests |
| 272 | + // ======================== |
| 273 | + |
| 274 | + testWithSourceEvolution("source evolution - reorder sources with named sources") { |
| 275 | + LastOptions.clear() |
| 276 | + |
| 277 | + val checkpointLocation = new Path(newMetadataDir) |
| 278 | + |
| 279 | + // First query: source1 then source2 |
| 280 | + val df1a = spark.readStream |
| 281 | + .format("org.apache.spark.sql.streaming.test") |
| 282 | + .name("source1") |
| 283 | + .load() |
| 284 | + |
| 285 | + val df2a = spark.readStream |
| 286 | + .format("org.apache.spark.sql.streaming.test") |
| 287 | + .name("source2") |
| 288 | + .load() |
| 289 | + |
| 290 | + val q1 = df1a.union(df2a).writeStream |
| 291 | + .format("org.apache.spark.sql.streaming.test") |
| 292 | + .option("checkpointLocation", checkpointLocation.toString) |
| 293 | + .trigger(ProcessingTime(10.seconds)) |
| 294 | + .start() |
| 295 | + q1.processAllAvailable() |
| 296 | + q1.stop() |
| 297 | + |
| 298 | + LastOptions.clear() |
| 299 | + |
| 300 | + // Second query: source2 then source1 (reordered); restart from the same checkpoint should still work
| 301 | + val df1b = spark.readStream |
| 302 | + .format("org.apache.spark.sql.streaming.test") |
| 303 | + .name("source1") |
| 304 | + .load() |
| 305 | + |
| 306 | + val df2b = spark.readStream |
| 307 | + .format("org.apache.spark.sql.streaming.test") |
| 308 | + .name("source2") |
| 309 | + .load() |
| 310 | + |
| 311 | + val q2 = df2b.union(df1b).writeStream // Note: reversed order |
| 312 | + .format("org.apache.spark.sql.streaming.test") |
| 313 | + .option("checkpointLocation", checkpointLocation.toString) |
| 314 | + .trigger(ProcessingTime(10.seconds)) |
| 315 | + .start() |
| 316 | + q2.processAllAvailable() |
| 317 | + q2.stop() |
| 318 | + |
| 319 | + // Both sources should still use their named paths |
| 320 | + verifySourcePath(checkpointLocation, "source1", atLeastOnce()) |
| 321 | + verifySourcePath(checkpointLocation, "source2", atLeastOnce()) |
| 322 | + } |
| 323 | + |
| 324 | + testWithSourceEvolution("source evolution - add new source with named sources") { |
| 325 | + LastOptions.clear() |
| 326 | + |
| 327 | + val checkpointLocation = new Path(newMetadataDir) |
| 328 | + |
| 329 | + // First query: only source1 |
| 330 | + val df1 = spark.readStream |
| 331 | + .format("org.apache.spark.sql.streaming.test") |
| 332 | + .name("source1") |
| 333 | + .load() |
| 334 | + |
| 335 | + val q1 = df1.writeStream |
| 336 | + .format("org.apache.spark.sql.streaming.test") |
| 337 | + .option("checkpointLocation", checkpointLocation.toString) |
| 338 | + .trigger(ProcessingTime(10.seconds)) |
| 339 | + .start() |
| 340 | + q1.processAllAvailable() |
| 341 | + q1.stop() |
| 342 | + |
| 343 | + LastOptions.clear() |
| 344 | + |
| 345 | + // Second query: add source2 |
| 346 | + val df1b = spark.readStream |
| 347 | + .format("org.apache.spark.sql.streaming.test") |
| 348 | + .name("source1") |
| 349 | + .load() |
| 350 | + |
| 351 | + val df2 = spark.readStream |
| 352 | + .format("org.apache.spark.sql.streaming.test") |
| 353 | + .name("source2") |
| 354 | + .load() |
| 355 | + |
| 356 | + val q2 = df1b.union(df2).writeStream |
| 357 | + .format("org.apache.spark.sql.streaming.test") |
| 358 | + .option("checkpointLocation", checkpointLocation.toString) |
| 359 | + .trigger(ProcessingTime(10.seconds)) |
| 360 | + .start() |
| 361 | + q2.processAllAvailable() |
| 362 | + q2.stop() |
| 363 | + |
| 364 | + // Both the existing source1 and the newly added source2 should be created
| 365 | + verifySourcePath(checkpointLocation, "source1", atLeastOnce()) |
| 366 | + verifySourcePath(checkpointLocation, "source2") |
| 367 | + } |
| 368 | + |
| 369 | + testWithSourceEvolution("named sources enforcement uses V2 offset log format") { |
| 370 | + LastOptions.clear() |
| 371 | + |
| 372 | + val checkpointLocation = new Path(newMetadataDir) |
| 373 | + |
| 374 | + val df1 = spark.readStream |
| 375 | + .format("org.apache.spark.sql.streaming.test") |
| 376 | + .name("source1") |
| 377 | + .load() |
| 378 | + |
| 379 | + val df2 = spark.readStream |
| 380 | + .format("org.apache.spark.sql.streaming.test") |
| 381 | + .name("source2") |
| 382 | + .load() |
| 383 | + |
| 384 | + val q = df1.union(df2).writeStream |
| 385 | + .format("org.apache.spark.sql.streaming.test") |
| 386 | + .option("checkpointLocation", checkpointLocation.toString) |
| 387 | + .trigger(ProcessingTime(10.seconds)) |
| 388 | + .start() |
| 389 | + q.processAllAvailable() |
| 390 | + q.stop() |
| 391 | + |
| 392 | + import org.apache.spark.sql.execution.streaming.checkpointing.{OffsetMap, OffsetSeqLog} |
| 393 | + val offsetLog = new OffsetSeqLog(spark, |
| 394 | + makeQualifiedPath(checkpointLocation.toString).toString + "/offsets") |
| 395 | + val offsetSeq = offsetLog.get(0) |
| 396 | + assert(offsetSeq.isDefined, "Offset log should have batch 0") |
| 397 | + assert(offsetSeq.get.isInstanceOf[OffsetMap], |
| 398 | + s"Expected OffsetMap but got ${offsetSeq.get.getClass.getSimpleName}") |
| 399 | + } |
| 400 | + |
| 401 | + testWithSourceEvolution("names preserved through union operations") { |
| 402 | + LastOptions.clear() |
| 403 | + |
| 404 | + val checkpointLocation = new Path(newMetadataDir) |
| 405 | + |
| 406 | + val df1 = spark.readStream |
| 407 | + .format("org.apache.spark.sql.streaming.test") |
| 408 | + .name("alpha") |
| 409 | + .load() |
| 410 | + |
| 411 | + val df2 = spark.readStream |
| 412 | + .format("org.apache.spark.sql.streaming.test") |
| 413 | + .name("beta") |
| 414 | + .load() |
| 415 | + |
| 416 | + val df3 = spark.readStream |
| 417 | + .format("org.apache.spark.sql.streaming.test") |
| 418 | + .name("gamma") |
| 419 | + .load() |
| 420 | + |
| 421 | + // Complex union: (alpha union beta) union gamma |
| 422 | + val q = df1.union(df2).union(df3).writeStream |
| 423 | + .format("org.apache.spark.sql.streaming.test") |
| 424 | + .option("checkpointLocation", checkpointLocation.toString) |
| 425 | + .trigger(ProcessingTime(10.seconds)) |
| 426 | + .start() |
| 427 | + q.processAllAvailable() |
| 428 | + q.stop() |
| 429 | + |
| 430 | + // All three sources should use their named paths |
| 431 | + verifySourcePath(checkpointLocation, "alpha") |
| 432 | + verifySourcePath(checkpointLocation, "beta") |
| 433 | + verifySourcePath(checkpointLocation, "gamma") |
| 434 | + } |
| 435 | + |
162 | 436 | // ============== |
163 | 437 | // Helper Methods |
164 | 438 | // ============== |
165 | 439 |
166 | 440 | /** |
167 | 441 | * Helper method to run tests with source evolution enabled. |
| 442 | + * Sets the offset log format to V2 (OffsetMap), since named sources require it.
168 | 443 | */ |
169 | 444 | def testWithSourceEvolution(testName: String, testTags: Tag*)(testBody: => Any): Unit = { |
170 | 445 | test(testName, testTags: _*) { |
171 | | - withSQLConf(SQLConf.ENABLE_STREAMING_SOURCE_EVOLUTION.key -> "true") { |
| 446 | + withSQLConf( |
| 447 | + SQLConf.ENABLE_STREAMING_SOURCE_EVOLUTION.key -> "true", |
| 448 | + SQLConf.STREAMING_OFFSET_LOG_FORMAT_VERSION.key -> "2") { |
172 | 449 | testBody |
173 | 450 | } |
174 | 451 | } |