
Commit 3998186

asl3 authored and gengliangwang committed
[SPARK-51747][SQL][FOLLOW-UP] Data source cached plan conf and migration guide
### What changes were proposed in this pull request?

Follow-up to #50538. Add a SQL legacy conf to enable/disable the change so that users can restore the previous behavior. Also add a migration guide note.

### Why are the changes needed?

The original PR changes the behavior of reading from a data source file with options. The flag gives users a way to restore the former behavior, if desired.

### Does this PR introduce _any_ user-facing change?

No (the original PR was a user-facing change; this PR simply adds a config).

### How was this patch tested?

Added a test for the config.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #50571 from asl3/asl3/filedatasourcecache-docsconf.

Authored-by: Amanda Liu <amanda.liu@databricks.com>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
1 parent 467644e

5 files changed: +102 additions, -29 deletions

docs/sql-migration-guide.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -64,6 +64,7 @@ license: |
 - Since Spark 4.0, Views allow control over how they react to underlying query changes. By default views tolerate column type changes in the query and compensate with casts. To disable this feature set `spark.sql.legacy.viewSchemaBindingMode` to `false`. This also removes the clause from `DESCRIBE EXTENDED` and `SHOW CREATE TABLE`.
 - Since Spark 4.0, The Storage-Partitioned Join feature flag `spark.sql.sources.v2.bucketing.pushPartValues.enabled` is set to `true`. To restore the previous behavior, set `spark.sql.sources.v2.bucketing.pushPartValues.enabled` to `false`.
 - Since Spark 4.0, the `sentences` function uses `Locale(language)` instead of `Locale.US` when `language` parameter is not `NULL` and `country` parameter is `NULL`.
+- Since Spark 4.0, reading from a file source table will correctly respect query options, e.g. delimiters. Previously, the first query plan was cached and subsequent option changes ignored. To restore the previous behavior, set `spark.sql.legacy.readFileSourceTableCacheIgnoreOptions` to `true`.
 
 ## Upgrading from Spark SQL 3.5.3 to 3.5.4
```
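A minimal sketch of the behavior change this note describes (the table `t` and its contents are hypothetical, mirroring the new DDLSuite test below; `spark` is an active `SparkSession`):

```scala
// Hypothetical CSV-backed table whose column values contain ';'.
spark.sql("CREATE TABLE t(a string, b string) USING CSV")
spark.sql("INSERT INTO TABLE t VALUES ('a;b', 'c')")

spark.table("t").show() // first read; the file source relation gets cached

// Since Spark 4.0 (default): the per-query option is honored and rows are re-split on ';'.
spark.read.option("delimiter", ";").table("t").show()

// Legacy behavior: reuse the cached plan, ignoring the changed option.
spark.conf.set("spark.sql.legacy.readFileSourceTableCacheIgnoreOptions", "true")
spark.read.option("delimiter", ";").table("t").show()
```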

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 10 additions & 0 deletions
```diff
@@ -5269,6 +5269,16 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val READ_FILE_SOURCE_TABLE_CACHE_IGNORE_OPTIONS =
+    buildConf("spark.sql.legacy.readFileSourceTableCacheIgnoreOptions")
+      .internal()
+      .doc("When set to true, reading from file source table caches the first query plan and " +
+        "ignores subsequent changes in query options. Otherwise, query options will be applied " +
+        "to the cached plan and may produce different results.")
+      .version("4.0.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val READ_SIDE_CHAR_PADDING = buildConf("spark.sql.readSideCharPadding")
     .doc("When true, Spark applies string padding when reading CHAR type columns/fields, " +
       "in addition to the write-side padding. This config is true by default to better enforce " +
```

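Because the conf is marked `.internal()`, it is consumed through `SQLConf.get` rather than documented publicly; a minimal sketch of how a rule reads it (mirroring the `FindDataSourceTable` change in the next file):

```scala
import org.apache.spark.sql.internal.SQLConf

// Reads the session value of the legacy flag; defaults to false per createWithDefault.
val ignoreOptions: Boolean =
  SQLConf.get.getConf(SQLConf.READ_FILE_SOURCE_TABLE_CACHE_IGNORE_OPTIONS)
```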
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 9 additions & 4 deletions
```diff
@@ -257,6 +257,8 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
       QualifiedTableName(table.identifier.catalog.get, table.database, table.identifier.table)
     val catalog = sparkSession.sessionState.catalog
     val dsOptions = DataSourceUtils.generateDatasourceOptions(extraOptions, table)
+    val readFileSourceTableCacheIgnoreOptions =
+      SQLConf.get.getConf(SQLConf.READ_FILE_SOURCE_TABLE_CACHE_IGNORE_OPTIONS)
     catalog.getCachedTable(qualifiedTableName) match {
       case null =>
         val dataSource =
@@ -274,13 +276,16 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
         catalog.cacheTable(qualifiedTableName, plan)
         plan
 
-      // If the cached table relation's options differ from the new options:
+      // If readFileSourceTableCacheIgnoreOptions is false AND
+      // the cached table relation's options differ from the new options:
       // 1. Create a new HadoopFsRelation with updated options
       // 2. Return a new LogicalRelation with the updated HadoopFsRelation
-      // This ensures the relation reflects any changes in data source options
+      // This ensures the relation reflects any changes in data source options.
+      // Otherwise, leave the cached table relation as is
       case r @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _, _)
-          if new CaseInsensitiveStringMap(fsRelation.options.asJava) !=
-            new CaseInsensitiveStringMap(dsOptions.asJava) =>
+          if !readFileSourceTableCacheIgnoreOptions &&
+            (new CaseInsensitiveStringMap(fsRelation.options.asJava) !=
+            new CaseInsensitiveStringMap(dsOptions.asJava)) =>
         val newFsRelation = fsRelation.copy(options = dsOptions)(sparkSession)
         r.copy(relation = newFsRelation)
```
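The guard compares the cached and incoming options through `CaseInsensitiveStringMap`, so maps differing only in key casing compare equal and keep the cached relation. A small sketch under that assumption:

```scala
import scala.jdk.CollectionConverters._
import org.apache.spark.sql.util.CaseInsensitiveStringMap

// Keys that differ only in case compare equal, so such a change alone
// does not rebuild the cached HadoopFsRelation.
val cached = new CaseInsensitiveStringMap(Map("Delimiter" -> ";").asJava)
val incoming = new CaseInsensitiveStringMap(Map("delimiter" -> ";").asJava)
assert(cached == incoming)

// A genuinely different value is unequal; with the legacy conf off, this
// takes the fsRelation.copy(options = dsOptions) branch above.
val changed = new CaseInsensitiveStringMap(Map("delimiter" -> ",").asJava)
assert(cached != changed)
```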

sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala

Lines changed: 76 additions & 23 deletions
```diff
@@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
+import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}
@@ -1376,32 +1377,84 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase {
     }
   }
 
-  test("SPARK-51747: Data source cached plan should respect options") {
-    withTable("t") {
-      spark.sql("CREATE TABLE t(a string, b string) USING CSV".stripMargin)
-      spark.sql("INSERT INTO TABLE t VALUES ('a;b', 'c')")
-      spark.sql("INSERT INTO TABLE t VALUES ('hello; world', 'test')")
-
-      // check initial contents of table
-      checkAnswer(spark.table("t"), Row("a;b", "c") :: Row("hello; world", "test") :: Nil)
+  test("SPARK-51747: Data source cached plan respects options if ignore conf disabled") {
+    val catalog = spark.sessionState.catalog
 
-      // no option
-      checkAnswer(
-        spark.sql("SELECT * FROM t"),
-        Row("a;b", "c") :: Row("hello; world", "test") :: Nil
-      )
+    // util to get cached table plan options
+    def getCachedTableOptions(
+        qualifiedTableName: QualifiedTableName): Map[String, String] = {
+      catalog.getCachedTable(qualifiedTableName) match {
+        case LogicalRelation(fsRelation: HadoopFsRelation, _, _, _, _) => fsRelation.options
+      }
+    }
 
-      // respect delimiter option
-      checkAnswer(
-        spark.sql("SELECT * FROM t WITH ('delimiter' = ';')"),
-        Row("a", "b,c") :: Row("hello", " world,test") :: Nil
-      )
+    Seq(true, false).foreach { ignoreOption =>
+      withSQLConf(
+        SQLConf.READ_FILE_SOURCE_TABLE_CACHE_IGNORE_OPTIONS.key -> ignoreOption.toString) {
+        withNamespace("ns") {
+          withTable("t") {
+            spark.sql(("CREATE TABLE t(a string, b string) " +
+              "USING CSV OPTIONS (maxColumns 500)").stripMargin)
+            spark.sql("INSERT INTO TABLE t VALUES ('a;b', 'c')")
+            spark.sql("INSERT INTO TABLE t VALUES ('hello; world', 'test')")
+
+            // check initial contents of table
+            val resultNoOptions = Row("a;b", "c") :: Row("hello; world", "test") :: Nil
+            checkAnswer(spark.table("t"), resultNoOptions)
+
+            // check cached plan contains create table options
+            val qualifiedTableName = QualifiedTableName(
+              CatalogManager.SESSION_CATALOG_NAME, catalog.getCurrentDatabase, "t")
+            val pathOption = catalog.getTableMetadata(TableIdentifier("t"))
+              .storage.locationUri.map("path" -> CatalogUtils.URIToString(_))
+            val createTableOptions: Map[String, String] = Map("maxcolumns" -> "500") ++ pathOption
+            assert(getCachedTableOptions(qualifiedTableName) == createTableOptions)
+
+            // delimiter ; option
+            val expectedResultDelimiter =
+              if (ignoreOption) {
+                resultNoOptions
+              } else {
+                Row("a", "b,c") :: Row("hello", " world,test") :: Nil
+              }
+            checkAnswer(
+              spark.sql("SELECT * FROM t WITH ('delimiter' = ';')"),
+              expectedResultDelimiter
+            )
+            checkAnswer(
+              spark.read.option("delimiter", ";").table("t"), // scala API test
+              expectedResultDelimiter
+            )
+            // cached plan should still only contain create table options
+            assert(getCachedTableOptions(qualifiedTableName) == createTableOptions)
 
-      // respect lineSep option
-      checkAnswer(
-        spark.sql("SELECT * FROM t WITH ('lineSep' = ';')"),
-        Row("a", null) :: Row("b", "c\n") :: Row("hello", null) :: Row(" world", "test\n") :: Nil
-      )
+            // no option
+            checkAnswer(
+              spark.sql("SELECT * FROM t"),
+              resultNoOptions
+            )
+            assert(getCachedTableOptions(qualifiedTableName) == createTableOptions)
+
+            // lineSep ; option
+            val expectedResultLineSep =
+              if (ignoreOption) {
+                resultNoOptions
+              } else {
+                Row("a", null) :: Row("b", "c\n") :: Row("hello", null) ::
+                  Row(" world", "test\n") :: Nil
+              }
+            checkAnswer(
+              spark.sql("SELECT * FROM t WITH ('lineSep' = ';')"),
+              expectedResultLineSep
+            )
+            checkAnswer(
+              spark.read.option("lineSep", ";").table("t"), // scala API test
+              expectedResultLineSep
+            )
+            assert(getCachedTableOptions(qualifiedTableName) == createTableOptions)
+          }
+        }
+      }
     }
   }
```
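For reference, the two read paths the test exercises are equivalent ways of passing per-query options to a file source table (table `t` as created in the test):

```scala
// SQL options clause, as used in the test:
spark.sql("SELECT * FROM t WITH ('delimiter' = ';')")

// DataFrameReader equivalent (the "scala API test" in the comments above):
spark.read.option("delimiter", ";").table("t")
```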

sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateWithInitialStateSuite.scala

Lines changed: 6 additions & 2 deletions
```diff
@@ -420,8 +420,12 @@ class FlatMapGroupsWithStateWithInitialStateSuite extends StateStoreMetricsTest
   Seq(true, false).foreach { skipEmittingInitialStateKeys =>
     testWithAllStateVersions("flatMapGroupsWithState - initial state and initial batch " +
       s"have same keys and skipEmittingInitialStateKeys=$skipEmittingInitialStateKeys") {
-      withSQLConf(SQLConf.FLATMAPGROUPSWITHSTATE_SKIP_EMITTING_INITIAL_STATE_KEYS.key ->
-        skipEmittingInitialStateKeys.toString) {
+      withSQLConf(
+        SQLConf.FLATMAPGROUPSWITHSTATE_SKIP_EMITTING_INITIAL_STATE_KEYS.key ->
+          skipEmittingInitialStateKeys.toString,
+        // restore behavior before SPARK-51747
+        SQLConf.READ_FILE_SOURCE_TABLE_CACHE_IGNORE_OPTIONS.key -> "true"
+      ) {
         val initialState = Seq(
           ("apple", 1L),
           ("orange", 2L)).toDS().groupByKey(_._1).mapValues(_._2)
```
