Skip to content

Commit 46ac78e

Browse files
mihailotim-db authored and cloud-fan committed
[SPARK-53734][SQL] Prefer table column over LCA when resolving array index
### What changes were proposed in this pull request?

Prefer table column over lateral column alias (LCA) when resolving array index.

### Why are the changes needed?

For a query like:

```
SELECT 1 AS col1, col2[col1] FROM VALUES(0, ARRAY(1,2));
```

the output should be (1,1), but the current Spark implementation outputs (1,2). This happens because `[col1]` is resolved as an LCA instead of being resolved to a table column: `innerResolve` never actually resolves the `field` of `UnresolvedExtractValue`, so the resolution of `field` falls through to the next item in the precedence chain, which is LCA.

### Does this PR introduce _any_ user-facing change?

Yes, the user now sees the correct result for the impacted query shape.

### How was this patch tested?

Added a test case for the impacted query.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #52472 from mihailotim-db/mihailo-timotic_data/array_index_lca_correctness.

Authored-by: Mihailo Timotic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 3c8c714 commit 46ac78e

File tree

3 files changed

+29
-3
lines changed

3 files changed

+29
-3
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,12 +167,17 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {
167167
}
168168
}
169169

170-
case u @ UnresolvedExtractValue(child, fieldName) =>
170+
case u @ UnresolvedExtractValue(child, field) =>
171171
val newChild = innerResolve(child, isTopLevel = false)
172+
val resolvedField = if (conf.getConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX)) {
173+
innerResolve(field, isTopLevel = false)
174+
} else {
175+
field
176+
}
172177
if (newChild.resolved) {
173-
ExtractValue(newChild, fieldName, resolver)
178+
ExtractValue(child = newChild, extraction = resolvedField, resolver = resolver)
174179
} else {
175-
u.copy(child = newChild)
180+
u.copy(child = newChild, extraction = resolvedField)
176181
}
177182

178183
case _ => e.mapChildren(innerResolve(_, isTopLevel = false))

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,16 @@ object SQLConf {
241241
}
242242
}
243243

244+
val PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX =
245+
buildConf("spark.sql.analyzer.preferColumnOverLcaInArrayIndex")
246+
.internal()
247+
.doc(
248+
"When true, prefer the column from the underlying relation over the lateral column alias " +
249+
"reference with the same name (see SPARK-53734)."
250+
)
251+
.booleanConf
252+
.createWithDefault(true)
253+
244254
val DONT_DEDUPLICATE_EXPRESSION_IF_EXPR_ID_IN_OUTPUT =
245255
buildConf("spark.sql.analyzer.dontDeduplicateExpressionIfExprIdInOutput")
246256
.internal()

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5079,6 +5079,17 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
50795079

50805080
checkAnswer(df, Row(1))
50815081
}
5082+
5083+
test("SPARK-53734: Prefer table column over LCA when resolving array index") {
5084+
val query = "SELECT 1 AS col1, col2[col1] FROM VALUES(0, ARRAY(1, 2));"
5085+
withSQLConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX.key -> "true") {
5086+
checkAnswer(sql(query), Row(1, 1))
5087+
}
5088+
5089+
withSQLConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX.key -> "false") {
5090+
checkAnswer(sql(query), Row(1, 2))
5091+
}
5092+
}
50825093
}
50835094

50845095
case class Foo(bar: Option[String])

0 commit comments

Comments
 (0)