[SPARK-53734][SQL] Prefer table column over LCA when resolving array index

mihailotim-db · cloud-fan · commit 46ac78ea367c · 2025-09-30T13:23:45.000+08:00
### What changes were proposed in this pull request? Prefer a table column over a lateral column alias (LCA) when resolving an array index. ### Why are the changes needed? For a query like: ``` SELECT 1 AS col1, col2[col1] FROM VALUES(0, ARRAY(1,2)); ``` the output should be (1,1), but the current Spark implementation outputs (1,2). With the table column `col1` = 0 the index expression selects `col2[0]` = 1, whereas with the alias `col1` = 1 it selects `col2[1]` = 2. The wrong result occurs because `[col1]` is resolved as an LCA instead of being resolved to a column: `innerResolve` never actually resolves the `extraction` field of `UnresolvedExtractValue`, so its resolution falls through to the next item in the precedence chain, which is LCA. ### Does this PR introduce _any_ user-facing change? Yes, users now see the correct result for the impacted query shape. ### How was this patch tested? Added a test case for the impacted query. ### Was this patch authored or co-authored using generative AI tooling? No Closes #52472 from mihailotim-db/mihailo-timotic_data/array_index_lca_correctness. Authored-by: Mihailo Timotic <mihailo.timotic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
@@ -167,12 +167,17 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {
           }
         }
 
-        case u @ UnresolvedExtractValue(child, fieldName) =>
+        case u @ UnresolvedExtractValue(child, field) =>
           val newChild = innerResolve(child, isTopLevel = false)
+          val resolvedField = if (conf.getConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX)) {
+            innerResolve(field, isTopLevel = false)
+          } else {
+            field
+          }
           if (newChild.resolved) {
-            ExtractValue(newChild, fieldName, resolver)
+            ExtractValue(child = newChild, extraction = resolvedField, resolver = resolver)
           } else {
-            u.copy(child = newChild)
+            u.copy(child = newChild, extraction = resolvedField)
           }
 
         case _ => e.mapChildren(innerResolve(_, isTopLevel = false))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -241,6 +241,16 @@ object SQLConf {
     }
   }
 
+  val PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX =
+    buildConf("spark.sql.analyzer.preferColumnOverLcaInArrayIndex")
+    .internal()
+    .doc(
+      "When true, prefer the column from the underlying relation over the lateral column alias " +
+      "reference with the same name (see SPARK-53734)."
+    )
+    .booleanConf
+    .createWithDefault(true)
+
   val DONT_DEDUPLICATE_EXPRESSION_IF_EXPR_ID_IN_OUTPUT =
     buildConf("spark.sql.analyzer.dontDeduplicateExpressionIfExprIdInOutput")
     .internal()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -5079,6 +5079,17 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
 
     checkAnswer(df, Row(1))
   }
+
+  test("SPARK-53734: Prefer table column over LCA when resolving array index") {
+    val query = "SELECT 1 AS col1, col2[col1] FROM VALUES(0, ARRAY(1, 2));"
+    withSQLConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX.key -> "true") {
+      checkAnswer(sql(query), Row(1, 1))
+    }
+
+    withSQLConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX.key -> "false") {
+      checkAnswer(sql(query), Row(1, 2))
+    }
+  }
 }
 
 case class Foo(bar: Option[String])

Original file line number	Diff line number	Diff line change
`@@ -167,12 +167,17 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {`
`167`	`167`	`}`
`168`	`168`	`}`
`169`	`169`
`170`		`- case u @ UnresolvedExtractValue(child, fieldName) =>`
	`170`	`+ case u @ UnresolvedExtractValue(child, field) =>`
`171`	`171`	`val newChild = innerResolve(child, isTopLevel = false)`
	`172`	`+ val resolvedField = if (conf.getConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX)) {`
	`173`	`+ innerResolve(field, isTopLevel = false)`
	`174`	`+ } else {`
	`175`	`+ field`
	`176`	`+ }`
`172`	`177`	`if (newChild.resolved) {`
`173`		`- ExtractValue(newChild, fieldName, resolver)`
	`178`	`+ ExtractValue(child = newChild, extraction = resolvedField, resolver = resolver)`
`174`	`179`	`} else {`
`175`		`- u.copy(child = newChild)`
	`180`	`+ u.copy(child = newChild, extraction = resolvedField)`
`176`	`181`	`}`
`177`	`182`
`178`	`183`	`case _ => e.mapChildren(innerResolve(_, isTopLevel = false))`