Skip to content

Commit 6e4d75a

Browse files
milastdbx authored and cloud-fan committed
[SPARK-43980][SQL] introducing select * except syntax
### What changes were proposed in this pull request? Changes the parser to support new syntax when using `*` to fetch columns from a source. The visit method now creates a new expression, `UnresolvedStarExcept`, when the new syntax is encountered. Expansion of this expression is the core logic of the feature. ### Why are the changes needed? To introduce the new `SELECT * EXCEPT (col1, col2)` syntax. ### Does this PR introduce _any_ user-facing change? Yes, this PR introduces new SQL syntax, which is used to explicitly exclude columns from star projection. ### How was this patch tested? Unit tests. Generated new golden files. ### Was this patch authored or co-authored using generative AI tooling? No Closes #43843 from milastdbx/feature/selectStarExcept. Authored-by: milastdbx <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 4304663 commit 6e4d75a

File tree

8 files changed

+929
-8
lines changed

8 files changed

+929
-8
lines changed

common/utils/src/main/resources/error/error-classes.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -993,6 +993,18 @@
993993
],
994994
"sqlState" : "54006"
995995
},
996+
"EXCEPT_NESTED_COLUMN_INVALID_TYPE" : {
997+
"message" : [
998+
"EXCEPT column <columnName> was resolved and expected to be StructType, but found type <dataType>."
999+
],
1000+
"sqlState" : "428H2"
1001+
},
1002+
"EXCEPT_OVERLAPPING_COLUMNS" : {
1003+
"message" : [
1004+
"Columns in an EXCEPT list must be distinct and non-overlapping, but got (<columns>)."
1005+
],
1006+
"sqlState" : "42702"
1007+
},
9961008
"EXPECT_PERMANENT_VIEW_NOT_TEMP" : {
9971009
"message" : [
9981010
"'<operation>' expects a permanent view but <viewName> is a temp view."

docs/sql-error-conditions.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,18 @@ The event time `<eventName>` has the invalid type `<eventType>`, but expected "T
587587

588588
Exceeds char/varchar type length limitation: `<limit>`.
589589

590+
### EXCEPT_NESTED_COLUMN_INVALID_TYPE
591+
592+
[SQLSTATE: 428H2](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
593+
594+
EXCEPT column `<columnName>` was resolved and expected to be StructType, but found type `<dataType>`.
595+
596+
### EXCEPT_OVERLAPPING_COLUMNS
597+
598+
[SQLSTATE: 42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
599+
600+
Columns in an EXCEPT list must be distinct and non-overlapping, but got (`<columns>`).
601+
590602
### EXPECT_PERMANENT_VIEW_NOT_TEMP
591603

592604
[SQLSTATE: 42809](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)

sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,10 @@ notMatchedBySourceAction
587587
| UPDATE SET assignmentList
588588
;
589589

590+
// Parenthesized list of multipart identifiers introduced by EXCEPT. It is referenced as an
// optional suffix of the star alternatives (`*` and `qualifiedName.*`) in primaryExpression.
exceptClause
591+
: EXCEPT LEFT_PAREN exceptCols=multipartIdentifierList RIGHT_PAREN
592+
;
593+
590594
assignmentList
591595
: assignment (COMMA assignment)*
592596
;
@@ -969,8 +973,8 @@ primaryExpression
969973
| LAST LEFT_PAREN expression (IGNORE NULLS)? RIGHT_PAREN #last
970974
| POSITION LEFT_PAREN substr=valueExpression IN str=valueExpression RIGHT_PAREN #position
971975
| constant #constantDefault
972-
| ASTERISK #star
973-
| qualifiedName DOT ASTERISK #star
976+
| ASTERISK exceptClause? #star
977+
| qualifiedName DOT ASTERISK exceptClause? #star
974978
| LEFT_PAREN namedExpression (COMMA namedExpression)+ RIGHT_PAREN #rowConstructor
975979
| LEFT_PAREN query RIGHT_PAREN #subqueryExpression
976980
| functionName LEFT_PAREN (setQuantifier? argument+=functionArgument

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala

Lines changed: 151 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.sql.catalyst.analysis
1919

20+
import org.apache.spark.SparkException
2021
import org.apache.spark.sql.AnalysisException
2122
import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier}
2223
import org.apache.spark.sql.catalyst.expressions._
@@ -358,20 +359,23 @@ abstract class Star extends LeafExpression with NamedExpression {
358359
def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression]
359360
}
360361

361-
362362
/**
363363
* Represents all of the input attributes to a given relational operator, for example in
364364
* "SELECT * FROM ...".
365+
* The same expansion is the starting point for "SELECT * EXCEPT(...) FROM ...".
365366
*
366367
* This is also used to expand structs. For example:
367368
* "SELECT record.* from (SELECT struct(a,b,c) as record ...)
368369
*
369370
* @param target an optional name that should be the target of the expansion. If omitted all
370371
* targets' columns are produced. This can either be a table name or struct name. This
371372
* is a list of identifiers that is the path of the expansion.
372-
*/
373-
case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevaluable {
374-
373+
*
374+
* This class provides the shared behavior between the classes for SELECT * ([[UnresolvedStar]])
375+
* and SELECT * EXCEPT ([[UnresolvedStarExcept]]). [[UnresolvedStar]] is just a case class of this,
376+
* while [[UnresolvedStarExcept]] adds some additional logic to the expand method.
377+
*/
378+
abstract class UnresolvedStarBase(target: Option[Seq[String]]) extends Star with Unevaluable {
375379
/**
376380
* Returns true if the nameParts is a subset of the last elements of qualifier of the attribute.
377381
*
@@ -383,7 +387,7 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu
383387
* - `SELECT t.* FROM ns1.ns2.t` where nameParts is Seq("t") and
384388
* qualifier is Seq("ns1", "ns2", "t").
385389
*/
386-
private def matchedQualifier(
390+
protected def matchedQualifier(
387391
attribute: Attribute,
388392
nameParts: Seq[String],
389393
resolver: Resolver): Boolean = {
@@ -444,6 +448,148 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu
444448
override def toString: String = target.map(_.mkString("", ".", ".")).getOrElse("") + "*"
445449
}
446450

451+
/**
 * Represents some of the input attributes to a given relational operator, for example in
 * "SELECT * EXCEPT(a) FROM ...".
 *
 * @param target an optional name that should be the target of the expansion. If omitted all
 *              targets' columns are produced. This can only be a table name. This
 *              is a list of identifiers that is the path of the expansion.
 *              NOTE(review): the comments in `expand` also describe struct expansion
 *              (target=`x` for SELECT x.*) - confirm which targets are meant to be supported.
 *
 * @param excepts a list of names that should be excluded from the expansion. Each name is a
 *                multi-part identifier; a path into a struct column excludes only that nested
 *                field and causes the struct to be rewritten without it.
 */
case class UnresolvedStarExcept(target: Option[Seq[String]], excepts: Seq[Seq[String]])
  extends UnresolvedStarBase(target) {

  /**
   * We expand the * EXCEPT by the following three steps:
   *  1. use the original .expand() to get top-level column list or struct expansion
   *  2. resolve excepts (with respect to the Seq[NamedExpression] returned from (1))
   *  3. filter the expanded columns with the resolved except list. recursively apply filtering in
   *     case of nested columns in the except list (in order to rewrite structs)
   *
   * Throws an unresolved-column error when an except entry matches none of the expanded columns,
   * EXCEPT_NESTED_COLUMN_INVALID_TYPE when an except path goes through a non-struct extraction,
   * and EXCEPT_OVERLAPPING_COLUMNS when the except list contains duplicate or overlapping
   * entries.
   */
  override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = {
    // Use the UnresolvedStarBase expand method to get a seq of NamedExpressions corresponding to
    // the star expansion. This will yield a list of top-level columns from the logical plan's
    // output, or in the case of struct expansion (e.g. target=`x` for SELECT x.*) it will give
    // a seq of Alias wrapping the struct field extraction.
    val expandedCols = super.expand(input, resolver)

    // Built once and shared by every except entry; it does not depend on the entry being resolved.
    val expandedAttrs = AttributeSeq(expandedCols.map(_.toAttribute))

    // resolve except list with respect to the expandedCols
    val resolvedExcepts = excepts.map { exceptParts =>
      expandedAttrs.resolve(exceptParts, resolver).getOrElse {
        // No expanded column matches this entry: report it as an unresolved column and suggest
        // the expanded columns ordered by similarity to the requested name.
        val exceptName = UnresolvedAttribute(exceptParts).name
        val orderedCandidates = StringUtils.orderSuggestedIdentifiersBySimilarity(
          exceptName, expandedCols.map(a => a.qualifier :+ a.name))
        throw QueryCompilationErrors.unresolvedColumnError(exceptName, orderedCandidates)
      }
    }

    // Convert each resolved except into a pair of (col: Attribute, nestedColumn) representing the
    // top level column in expandedCols that we must 'filter' based on nestedColumn.
    @scala.annotation.tailrec
    def getRootColumn(expr: Expression, nestedColumn: Seq[String] = Nil)
      : (NamedExpression, Seq[String]) = expr match {
      case GetStructField(fieldExpr, _, Some(fieldName)) =>
        getRootColumn(fieldExpr, fieldName +: nestedColumn)
      case e: NamedExpression => e -> nestedColumn
      case other: ExtractValue => throw new AnalysisException(
        errorClass = "EXCEPT_NESTED_COLUMN_INVALID_TYPE",
        messageParameters = Map("columnName" -> other.sql, "dataType" -> other.dataType.toString))
      // Any other shape breaks the invariant documented below; fail with an internal error
      // rather than an opaque MatchError.
      case other => throw SparkException.internalError(
        s"Unexpected expression in EXCEPT column resolution: ${other.sql}")
    }

    // An exceptPair represents the column in expandedCols along with the path of a nestedColumn
    // that should be except-ed. Consider two examples:
    //   1. excepting the entire col1 = (col1, Seq())
    //   2. excepting a nested field in col2, col2.a.b = (col2, Seq(a, b))
    // INVARIANT: we rely on the structure of the resolved except being an Alias of GetStructField
    // in the case of nested columns.
    val exceptPairs = resolvedExcepts.map {
      case Alias(exceptExpr, _) => getRootColumn(exceptExpr)
      case except: NamedExpression => except -> Seq.empty
    }

    // Filter columns which correspond to ones listed in the except list and return a new list of
    // columns which exclude the columns in the except list. The 'filtering' manifests as either
    // dropping the column from the list of columns we return, or rewriting the projected column in
    // order to remove excepts that refer to nested columns. For example, with the example above:
    // excepts = Seq(
    //   (col1, Seq()),     => filter col1 from the output
    //   (col2, Seq(a, b))  => rewrite col2 in the output so that it doesn't include the nested
    // )                       field corresponding to col2.a.b
    //
    // This occurs in two steps:
    //   1. group the excepts by the column they refer to (groupedExcepts)
    //   2. filter/rewrite input columns based on four cases:
    //       a. column doesn't match any groupedExcepts => column unchanged
    //       b. column exists in groupedExcepts and:
    //           i.   none of remainingExcepts are empty => recursively apply filterColumns over
    //                the struct fields in order to rewrite the struct
    //           ii.  a remainingExcept is empty, but there are multiple remainingExcepts => we
    //                must have duplicate/overlapping excepts - throw an error
    //           iii. [otherwise] remainingExcept is exactly Seq(Seq()) => this is the base
    //                'filtering' case. we omit the column from the output (this is a column we
    //                would like to except). NOTE: this case isn't explicitly listed in the
    //                `collect` below since we 'collect' columns which match the cases above and
    //                omit ones that fall into this remaining case.
    def filterColumns(columns: Seq[NamedExpression], excepts: Seq[(NamedExpression, Seq[String])])
      : Seq[NamedExpression] = {
      // group the except pairs by the column they refer to. NOTE: no groupMap until scala 2.13
      val groupedExcepts: AttributeMap[Seq[Seq[String]]] =
        AttributeMap(excepts.groupBy(_._1.toAttribute).view.mapValues(v => v.map(_._2)))

      // map input columns while searching for the except entry corresponding to the current column
      columns.map(col => col -> groupedExcepts.get(col.toAttribute)).collect {
        // pass through columns that don't match anything in groupedExcepts
        case (col, None) => col
        // found a match but nestedExcepts has remaining excepts - recurse to rewrite the struct
        case (col, Some(nestedExcepts)) if nestedExcepts.forall(_.nonEmpty) =>
          val fields = col.dataType match {
            case s: StructType => s.fields
            // we shouldn't be here since we EXCEPT_NESTED_COLUMN_INVALID_TYPE in getRootColumn
            // for this case - just throw internal error
            case _ => throw SparkException.internalError("Invalid column type")
          }
          val extractedFields = fields.zipWithIndex.map { case (f, i) =>
            Alias(GetStructField(col, i), f.name)()
          }
          val newExcepts = nestedExcepts.map { nestedExcept =>
            // INVARIANT: we cannot have duplicate column names in nested columns, thus, this
            // lookup will find the one and only column corresponding to the correct
            // extractedField. If the invariant is ever broken, fail with an internal error
            // instead of a bare NoSuchElementException.
            extractedFields.collectFirst {
              case field if resolver(field.name, nestedExcept.head) =>
                field.toAttribute -> nestedExcept.tail
            }.getOrElse {
              throw SparkException.internalError(
                s"Cannot find nested field ${nestedExcept.head} in EXCEPT column ${col.name}")
            }
          }
          Alias(CreateStruct(filterColumns(extractedFields.toSeq, newExcepts)), col.name)()
        // if there are multiple nestedExcepts but one is empty we must have overlapping except
        // columns. throw an error.
        case (col, Some(nestedExcepts)) if nestedExcepts.size > 1 =>
          throw new AnalysisException(
            errorClass = "EXCEPT_OVERLAPPING_COLUMNS",
            messageParameters = Map(
              // `this.excepts` is the user-written list; the parameter `excepts` shadows it here
              "columns" -> this.excepts.map(_.mkString(".")).mkString(", ")))
      }
    }

    filterColumns(expandedCols, exceptPairs)
  }
}
579+
580+
/**
 * The plain star: stands for every input attribute of a given relational operator, as in
 * "SELECT * FROM ...".
 *
 * A star can also expand the fields of a struct. For example:
 * "SELECT record.* from (SELECT struct(a,b,c) as record ...)
 *
 * All behavior is inherited from [[UnresolvedStarBase]]; this case class adds nothing to it.
 *
 * @param target optional path (a list of identifiers) naming what to expand, either a table
 *               name or a struct name. When omitted, the columns of all targets are produced.
 */
case class UnresolvedStar(target: Option[Seq[String]])
  extends UnresolvedStarBase(target)
592+
447593
/**
448594
* Represents all of the input attributes to a given relational operator, for example in
449595
* "SELECT `(id)?+.+` FROM ...".

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1786,7 +1786,27 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging {
17861786
* Both un-targeted (global) and targeted aliases are supported.
17871787
*/
17881788
override def visitStar(ctx: StarContext): Expression = withOrigin(ctx) {
  // Optional qualifier in front of the star (e.g. `t` in `t.*`). `Option(...)` maps the null
  // that the parser context returns for a bare `*` to None.
  val target = Option(ctx.qualifiedName()).map(_.identifier.asScala.map(_.getText).toSeq)

  // `exceptClause` is optional in the grammar, so it is null when no `EXCEPT (...)` follows the
  // star; only then is the plain UnresolvedStar produced.
  if (ctx.exceptClause != null) {
    visitStarExcept(ctx, target)
  } else {
    UnresolvedStar(target)
  }
}
1798+
1799+
/**
 * Build the expression for a star followed by an except list (i.e. all - except list): it
 * selects every element of the specified object apart from those named in the list.
 * Works for both the un-targeted (global) star and a star qualified by a target.
 *
 * @param ctx    the star parse context; its `exceptClause` supplies the excluded names
 * @param target the optional qualifier of the star, already extracted by the caller
 */
def visitStarExcept(ctx: StarContext, target: Option[Seq[String]]): Expression = withOrigin(ctx) {
  // Each entry of the except list is a multipart identifier, visited into its name parts.
  val excludedNames = ctx.exceptClause.exceptCols.multipartIdentifier.asScala
    .map(typedVisit[Seq[String]])
    .toSeq
  UnresolvedStarExcept(target, excludedNames)
}
17911811

17921812
/**

0 commit comments

Comments
 (0)