Skip to content

Commit 91ed828

Browse files
committed
feat: optimize sparksql grammar
1 parent 6085fdf commit 91ed828

File tree

3 files changed

+130
-101
lines changed

3 files changed

+130
-101
lines changed

src/grammar/spark/SparkSqlLexer.g4

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -407,7 +407,7 @@ GTE : '>=' | '!<';
407407
NOT : '!';
408408
PLUS : '+';
409409
MINUS : '-';
410-
ASTERISK : '*';
410+
STAR : '*';
411411
SLASH : '/';
412412
PERCENT : '%';
413413
TILDE : '~';

src/grammar/spark/SparkSqlParser.g4

Lines changed: 77 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -75,12 +75,12 @@ statement
7575
| KW_ANALYZE KW_TABLES ((KW_FROM | KW_IN) namespaceName)? KW_COMPUTE KW_STATISTICS (KW_NOSCAN)? # analyzeTables
7676
| KW_ALTER KW_TABLE tableName KW_ADD KW_COLUMN qualifiedColTypeWithPositionForAdd # alterTableAddColumn
7777
| KW_ALTER KW_TABLE tableName KW_ADD KW_COLUMNS LEFT_PAREN qualifiedColTypeWithPositionSeqForAdd RIGHT_PAREN # alterTableAddColumns
78-
| KW_ALTER KW_TABLE table=tableName KW_RENAME KW_COLUMN columnName KW_TO columnNameCreate # renameTableColumn
7978
| KW_ALTER KW_TABLE tableName KW_DROP KW_COLUMN (ifExists)? columnName # alterTableDropColumn
8079
| KW_ALTER KW_TABLE tableName KW_DROP KW_COLUMNS (ifExists)? LEFT_PAREN columnNameSeq RIGHT_PAREN # dropTableColumns
8180
| KW_ALTER (KW_TABLE tableName | KW_VIEW viewName) KW_RENAME KW_TO multipartIdentifier # renameTable
8281
| KW_ALTER (KW_TABLE tableName | KW_VIEW viewName) KW_SET KW_TBLPROPERTIES propertyList # setTableProperties
8382
| KW_ALTER (KW_TABLE tableName | KW_VIEW viewName) KW_UNSET KW_TBLPROPERTIES (ifExists)? propertyList # unsetTableProperties
83+
| KW_ALTER KW_TABLE table=tableName KW_RENAME KW_COLUMN columnName KW_TO columnNameCreate # renameTableColumn
8484
| KW_ALTER KW_TABLE table=tableName (KW_ALTER | KW_CHANGE) KW_COLUMN? column=columnName alterColumnAction? # alterTableAlterColumn
8585
| KW_ALTER KW_TABLE table=tableName partitionSpec? KW_CHANGE KW_COLUMN? colName=columnName columnType colPosition? # hiveChangeColumn
8686
| KW_ALTER KW_TABLE table=tableName partitionSpec? KW_REPLACE KW_COLUMNS LEFT_PAREN qualifiedColTypeWithPositionSeqForReplace RIGHT_PAREN #
@@ -166,17 +166,13 @@ statement
166166
| (KW_MSCK)? KW_REPAIR KW_TABLE tableName (option=(KW_ADD | KW_DROP | KW_SYNC) KW_PARTITIONS)? # repairTable
167167
| op=(KW_ADD | KW_LIST) identifier .*? # manageResource
168168
| KW_SET KW_ROLE .*? # failNativeCommand
169-
| KW_SET KW_TIME KW_ZONE interval # setTimeZoneInterval
170-
| KW_SET KW_TIME KW_ZONE (stringLit | KW_LOCAL) # setTimeZone
171-
| KW_SET KW_TIME KW_ZONE .*? # setTimeZoneAny
169+
| KW_SET KW_TIME KW_ZONE (interval | stringLit | KW_LOCAL | .*?) # setTimeZone
172170
| KW_SET (KW_VARIABLE | KW_VAR) assignmentList # setVariableAssignment
173171
| KW_SET (KW_VARIABLE | KW_VAR) LEFT_PAREN multipartIdentifierList RIGHT_PAREN EQ LEFT_PAREN query RIGHT_PAREN # setVariableMultiAssignment
174-
| KW_SET quotedIdentifier EQ BACKQUOTED_IDENTIFIER # setConfig
172+
| KW_SET (quotedIdentifier | .*?) EQ BACKQUOTED_IDENTIFIER # setConfig
175173
| KW_SET quotedIdentifier (EQ .*?)? # setConfigAndValue
176-
| KW_SET .*? EQ BACKQUOTED_IDENTIFIER # setConfigAnyKey
177174
| KW_SET .*? # setAny
178-
| KW_RESET quotedIdentifier # resetConfig
179-
| KW_RESET .*? # resetAny
175+
| KW_RESET (quotedIdentifier | .*?) # resetConfig
180176
| KW_CREATE KW_INDEX (ifNotExists)? identifier KW_ON KW_TABLE? tableName (
181177
KW_USING indexType=identifier
182178
)? LEFT_PAREN multipartIdentifierPropertyList RIGHT_PAREN (KW_OPTIONS options=propertyList)? # createIndex
@@ -186,40 +182,32 @@ statement
186182
;
187183

188184
unsupportedHiveNativeCommands
189-
: kw1=(KW_CREATE | KW_DROP) kw2=KW_ROLE
190-
| kw1=(KW_GRANT | KW_REVOKE) kw2=KW_ROLE?
191-
| kw1=KW_SHOW kw2=(
192-
KW_GRANT
193-
| KW_PRINCIPALS
194-
| KW_COMPACTIONS
195-
| KW_TRANSACTIONS
196-
| KW_INDEXES
197-
| KW_LOCKS
198-
)
199-
| kw1=KW_SHOW kw2=KW_ROLE kw3=KW_GRANT?
200-
| kw1=KW_SHOW KW_CURRENT? KW_ROLES
201-
| kw1=KW_SHOW kw2=KW_CREATE kw3=KW_TABLE
202-
| kw1=(KW_CREATE | KW_DROP | KW_ALTER) kw2=KW_INDEX
203-
| kw1=(KW_EXPORT | KW_IMPORT | KW_LOCK | KW_UNLOCK) kw2=KW_TABLE
204-
| kw1=(KW_LOCK | KW_UNLOCK) kw2=KW_DATABASE
205-
| kw1=(KW_CREATE | KW_DROP) kw2=KW_TEMPORARY kw3=KW_MACRO
206-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=KW_NOT kw4=(KW_CLUSTERED | KW_SORTED | KW_SKEWED)
207-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=(KW_CLUSTERED | KW_SKEWED) kw4=KW_BY
208-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=KW_SKEWED kw4=KW_BY
209-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=KW_NOT kw4=KW_STORED kw5=KW_AS kw6=KW_DIRECTORIES
210-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=KW_SET kw4=KW_SKEWED kw5=KW_LOCATION
211-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=(KW_EXCHANGE | KW_ARCHIVE | KW_UNARCHIVE) kw4=KW_PARTITION
212-
| kw1=KW_ALTER kw2=KW_TABLE tableName kw3=KW_TOUCH
213-
| kw1=KW_ALTER kw2=KW_TABLE tableName partitionSpec? (
185+
: (KW_CREATE | KW_DROP) KW_ROLE
186+
| (KW_GRANT | KW_REVOKE) KW_ROLE?
187+
| KW_SHOW (KW_GRANT | KW_PRINCIPALS | KW_COMPACTIONS | KW_TRANSACTIONS | KW_INDEXES | KW_LOCKS)
188+
| KW_SHOW KW_ROLE KW_GRANT?
189+
| KW_SHOW KW_CURRENT? KW_ROLES
190+
| KW_SHOW KW_CREATE KW_TABLE
191+
| (KW_CREATE | KW_DROP | KW_ALTER) KW_INDEX
192+
| (KW_EXPORT | KW_IMPORT | KW_LOCK | KW_UNLOCK) KW_TABLE
193+
| (KW_LOCK | KW_UNLOCK) KW_DATABASE
194+
| (KW_CREATE | KW_DROP) KW_TEMPORARY KW_MACRO
195+
| KW_ALTER KW_TABLE tableName KW_NOT (KW_CLUSTERED | KW_SORTED | KW_SKEWED)
196+
| KW_ALTER KW_TABLE tableName (KW_CLUSTERED | KW_SKEWED) KW_BY
197+
| KW_ALTER KW_TABLE tableName KW_NOT KW_STORED KW_AS KW_DIRECTORIES
198+
| KW_ALTER KW_TABLE tableName KW_SET KW_SKEWED KW_LOCATION
199+
| KW_ALTER KW_TABLE tableName (KW_EXCHANGE | KW_ARCHIVE | KW_UNARCHIVE) KW_PARTITION
200+
| KW_ALTER KW_TABLE tableName KW_TOUCH
201+
| KW_ALTER KW_TABLE tableName partitionSpec? (
214202
KW_COMPACT
215203
| KW_CONCATENATE
216204
| (KW_SET KW_FILEFORMAT)
217205
| (KW_REPLACE KW_COLUMNS)
218206
)
219-
| kw1=KW_START kw2=KW_TRANSACTION
220-
| kw1=KW_COMMIT
221-
| kw1=KW_ROLLBACK
222-
| kw1=KW_DFS
207+
| KW_START KW_TRANSACTION
208+
| KW_COMMIT
209+
| KW_ROLLBACK
210+
| KW_DFS
223211
;
224212

225213
bucketSpec
@@ -430,8 +418,8 @@ identifierReference
430418
;
431419

432420
queryOrganization
433-
: (KW_ORDER KW_BY orderOrSortByClause)? (KW_CLUSTER KW_BY clusterOrDistributeBy)? (
434-
KW_DISTRIBUTE KW_BY clusterOrDistributeBy
421+
: (KW_ORDER KW_BY orderOrSortByClause)? (KW_CLUSTER KW_BY expressionSeq)? (
422+
KW_DISTRIBUTE KW_BY expressionSeq
435423
)? (KW_SORT KW_BY orderOrSortByClause)? windowClause? limitClause? (
436424
KW_OFFSET offset=expression
437425
)?
@@ -445,22 +433,16 @@ orderOrSortByClause
445433
: sortItem (COMMA sortItem)*
446434
;
447435

448-
clusterOrDistributeBy
449-
: expression (COMMA expression)*
450-
;
451-
452436
queryTerm
453437
: queryPrimary
454438
| left=queryTerm operator=(KW_INTERSECT | KW_UNION | KW_EXCEPT | KW_MINUS) setQuantifier? right=queryTerm
455-
| left=queryTerm operator=KW_INTERSECT setQuantifier? right=queryTerm
456-
| left=queryTerm operator=(KW_UNION | KW_EXCEPT | KW_MINUS) setQuantifier? right=queryTerm
457439
;
458440

459441
queryPrimary
460442
: querySpecification
461443
| fromClause fromStatementBody+
462444
| KW_TABLE tableName
463-
| KW_VALUES expression (COMMA expression)* tableAlias
445+
| KW_VALUES expressionSeq tableAlias
464446
| LEFT_PAREN query RIGHT_PAREN
465447
;
466448

@@ -476,8 +458,7 @@ fromStatementBody
476458
;
477459

478460
querySpecification
479-
: transformClause fromClause? lateralView* whereClause? aggregationClause? havingClause? windowClause?
480-
| selectClause fromClause? lateralView* whereClause? aggregationClause? havingClause? windowClause?
461+
: (transformClause | selectClause) fromClause? lateralView* whereClause? aggregationClause? havingClause? windowClause?
481462
;
482463

483464
transformClause
@@ -505,7 +486,7 @@ setClause
505486
matchedClause
506487
: KW_WHEN KW_MATCHED (KW_AND matchedCond=booleanExpression)? KW_THEN (
507488
KW_DELETE
508-
| KW_UPDATE KW_SET (ASTERISK | assignmentList)
489+
| KW_UPDATE KW_SET (STAR | assignmentList)
509490
)
510491
;
511492

@@ -521,7 +502,7 @@ notMatchedBySourceClause
521502
;
522503

523504
notMatchedAction
524-
: KW_INSERT ASTERISK
505+
: KW_INSERT STAR
525506
| KW_INSERT LEFT_PAREN multipartIdentifierList RIGHT_PAREN KW_VALUES LEFT_PAREN expression (
526507
COMMA expression
527508
)* RIGHT_PAREN
@@ -548,10 +529,9 @@ hint
548529
;
549530

550531
hintStatement
551-
: hintName=identifier
552-
| hintName=identifier LEFT_PAREN parameters+=primaryExpression (
553-
COMMA parameters+=primaryExpression
554-
)* RIGHT_PAREN
532+
: hintName=identifier (
533+
LEFT_PAREN parameters+=primaryExpression (COMMA parameters+=primaryExpression)* RIGHT_PAREN
534+
)?
555535
;
556536

557537
fromClause
@@ -645,7 +625,7 @@ ifExists
645625
;
646626

647627
lateralView
648-
: KW_LATERAL KW_VIEW (KW_OUTER)? viewName LEFT_PAREN (expression (COMMA expression)*)? RIGHT_PAREN tableAlias (
628+
: KW_LATERAL KW_VIEW (KW_OUTER)? viewName LEFT_PAREN expressionSeq? RIGHT_PAREN tableAlias (
649629
KW_AS? colName+=identifier (COMMA colName+=identifier)*
650630
)?
651631
;
@@ -661,16 +641,15 @@ relation
661641
;
662642

663643
joinRelation
664-
: (joinType) KW_JOIN KW_LATERAL? right=relationPrimary joinCriteria?
665-
| KW_NATURAL joinType KW_JOIN KW_LATERAL? right=relationPrimary
644+
: (joinType) KW_JOIN KW_LATERAL? relationPrimary joinCriteria?
645+
| KW_NATURAL joinType KW_JOIN KW_LATERAL? relationPrimary
666646
;
667647

668648
joinType
669649
: KW_INNER?
670650
| KW_CROSS
671-
| KW_LEFT KW_OUTER?
672651
| KW_LEFT? (KW_SEMI | KW_ANTI)
673-
| (KW_RIGHT | KW_FULL) KW_OUTER?
652+
| (KW_LEFT | KW_RIGHT | KW_FULL) KW_OUTER?
674653
;
675654

676655
joinCriteria
@@ -686,11 +665,10 @@ sample
686665

687666
sampleMethod
688667
: negativeSign=MINUS? percentage=(INTEGER_VALUE | DECIMAL_VALUE) KW_PERCENTLIT
689-
| expression KW_ROWS
668+
| bytes=expression KW_ROWS?
690669
| sampleType=KW_BUCKET numerator=INTEGER_VALUE KW_OUT KW_OF denominator=INTEGER_VALUE (
691670
KW_ON (identifier | qualifiedName LEFT_PAREN RIGHT_PAREN)
692671
)?
693-
| bytes=expression
694672
;
695673

696674
identifierList
@@ -719,16 +697,14 @@ identifierComment
719697

720698
relationPrimary
721699
: (tableName | viewName | identifierReference) temporalClause? sample? tableAlias
722-
| LEFT_PAREN query RIGHT_PAREN sample? tableAlias
723-
| LEFT_PAREN relation RIGHT_PAREN sample? tableAlias
724-
| KW_VALUES expression (COMMA expression)* tableAlias
700+
| LEFT_PAREN (query | relation) RIGHT_PAREN sample? tableAlias
701+
| KW_VALUES expressionSeq tableAlias
725702
| functionName LEFT_PAREN (functionTableArgument (COMMA functionTableArgument)*)? RIGHT_PAREN tableAlias
726703
;
727704

728705
functionTableSubqueryArgument
729706
: KW_TABLE tableName tableArgumentPartitioning?
730-
| KW_TABLE LEFT_PAREN tableName RIGHT_PAREN tableArgumentPartitioning?
731-
| KW_TABLE LEFT_PAREN query RIGHT_PAREN tableArgumentPartitioning?
707+
| KW_TABLE LEFT_PAREN (tableName | query) RIGHT_PAREN tableArgumentPartitioning?
732708
;
733709

734710
tableArgumentPartitioning
@@ -740,7 +716,7 @@ tableArgumentPartitioning
740716
| partition+=expression
741717
)
742718
)
743-
) ((KW_ORDER | KW_SORT) KW_BY ( ((LEFT_PAREN orderOrSortByClause RIGHT_PAREN) | sortItem)))?
719+
) ((KW_ORDER | KW_SORT) KW_BY (((LEFT_PAREN orderOrSortByClause RIGHT_PAREN) | sortItem)))?
744720
;
745721

746722
functionTableNamedArgumentExpression
@@ -846,33 +822,37 @@ booleanExpression
846822
: (KW_NOT | NOT) booleanExpression
847823
| KW_EXISTS LEFT_PAREN query RIGHT_PAREN
848824
| valueExpression predicate?
849-
| left=booleanExpression operator=KW_AND right=booleanExpression
850-
| left=booleanExpression operator=KW_OR right=booleanExpression
825+
| left=booleanExpression operator=(KW_AND | KW_OR) right=booleanExpression
851826
;
852827

853828
predicate
854829
: KW_NOT? kind=KW_BETWEEN lower=valueExpression KW_AND upper=valueExpression
855-
| KW_NOT? kind=KW_IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN
856-
| KW_NOT? kind=KW_IN LEFT_PAREN query RIGHT_PAREN
830+
| KW_NOT? kind=KW_IN LEFT_PAREN (expressionSeq | query) RIGHT_PAREN
857831
| KW_NOT? kind=(KW_RLIKE | KW_REGEXP) pattern=valueExpression
858832
| KW_NOT? kind=(KW_LIKE | KW_ILIKE) quantifier=(KW_ANY | KW_SOME | KW_ALL) (
859833
LEFT_PAREN RIGHT_PAREN
860-
| LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN
834+
| LEFT_PAREN expressionSeq RIGHT_PAREN
861835
)
862836
| KW_NOT? kind=(KW_LIKE | KW_ILIKE) pattern=valueExpression (KW_ESCAPE escapeChar=stringLit)?
863-
| KW_IS KW_NOT? kind=KW_NULL
864-
| KW_IS KW_NOT? kind=(KW_TRUE | KW_FALSE | KW_UNKNOWN)
837+
| KW_IS KW_NOT? kind=(KW_NULL | KW_TRUE | KW_FALSE | KW_UNKNOWN)
865838
| KW_IS KW_NOT? kind=KW_DISTINCT KW_FROM right=valueExpression
866839
;
867840

868841
valueExpression
869842
: primaryExpression
870843
| operator=(MINUS | PLUS | TILDE) valueExpression
871-
| left=valueExpression operator=(ASTERISK | SLASH | PERCENT | KW_DIV) right=valueExpression
872-
| left=valueExpression operator=(PLUS | MINUS | CONCAT_PIPE) right=valueExpression
873-
| left=valueExpression operator=AMPERSAND right=valueExpression
874-
| left=valueExpression operator=HAT right=valueExpression
875-
| left=valueExpression operator=PIPE right=valueExpression
844+
| left=valueExpression operator=(
845+
STAR
846+
| SLASH
847+
| PERCENT
848+
| KW_DIV
849+
| PLUS
850+
| MINUS
851+
| CONCAT_PIPE
852+
| AMPERSAND
853+
| HAT
854+
| PIPE
855+
) right=valueExpression
876856
| left=valueExpression comparisonOperator right=valueExpression
877857
;
878858

@@ -892,25 +872,23 @@ datetimeUnit
892872

893873
primaryExpression
894874
: name=(KW_CURRENT_DATE | KW_CURRENT_TIMESTAMP | KW_CURRENT_USER | KW_USER | KW_SESSION_USER)
895-
| name=(KW_TIMESTAMPADD | KW_DATEADD | KW_DATE_ADD) LEFT_PAREN (
896-
unit=datetimeUnit
897-
| invalidUnit=stringLit
898-
) COMMA unitsAmount=valueExpression COMMA timestamp=valueExpression RIGHT_PAREN
899-
| name=(KW_TIMESTAMPDIFF | KW_DATEDIFF | KW_DATE_DIFF | KW_TIMEDIFF) LEFT_PAREN (
900-
unit=datetimeUnit
901-
| invalidUnit=stringLit
902-
) COMMA startTimestamp=valueExpression COMMA endTimestamp=valueExpression RIGHT_PAREN
903-
| KW_CASE whenClause+ (KW_ELSE elseExpression=expression)? KW_END
904-
| KW_CASE expression whenClause+ (KW_ELSE elseExpression=expression)? KW_END
875+
| name=(
876+
KW_TIMESTAMPADD
877+
| KW_DATEADD
878+
| KW_DATE_ADD
879+
| KW_TIMESTAMPDIFF
880+
| KW_DATEDIFF
881+
| KW_DATE_DIFF
882+
| KW_TIMEDIFF
883+
) LEFT_PAREN (unit=datetimeUnit | invalidUnit=stringLit) COMMA valueExpression COMMA valueExpression RIGHT_PAREN
884+
| KW_CASE expression? whenClause+ (KW_ELSE elseExpression=expression)? KW_END
905885
| name=(KW_CAST | KW_TRY_CAST) LEFT_PAREN expression KW_AS dataType RIGHT_PAREN
906-
| KW_STRUCT LEFT_PAREN (namedExpression (COMMA namedExpression)*)? RIGHT_PAREN
907-
| KW_FIRST LEFT_PAREN expression (KW_IGNORE KW_NULLS)? RIGHT_PAREN
908-
| KW_ANY_VALUE LEFT_PAREN expression (KW_IGNORE KW_NULLS)? RIGHT_PAREN
909-
| KW_LAST LEFT_PAREN expression (KW_IGNORE KW_NULLS)? RIGHT_PAREN
886+
| KW_STRUCT LEFT_PAREN namedExpressionSeq? RIGHT_PAREN
887+
| (KW_FIRST | KW_ANY_VALUE | KW_LAST) LEFT_PAREN expression (KW_IGNORE KW_NULLS)? RIGHT_PAREN
910888
| KW_POSITION LEFT_PAREN substr=valueExpression KW_IN str=valueExpression RIGHT_PAREN
911889
| constant
912-
| ASTERISK
913-
| qualifiedName DOT ASTERISK
890+
| STAR
891+
| qualifiedName DOT STAR
914892
| LEFT_PAREN namedExpression (COMMA namedExpression)+ RIGHT_PAREN
915893
| LEFT_PAREN query RIGHT_PAREN
916894
| KW_IDENTIFIER LEFT_PAREN expression RIGHT_PAREN
@@ -974,7 +952,7 @@ comparisonOperator
974952
arithmeticOperator
975953
: PLUS
976954
| MINUS
977-
| ASTERISK
955+
| STAR
978956
| SLASH
979957
| PERCENT
980958
| KW_DIV
@@ -1188,14 +1166,13 @@ windowSpec
11881166
* https://github.com/tunnelvisionlabs/antlr4ts/issues/417
11891167
*/
11901168
windowFrame
1191-
: frameType=(KW_RANGE | KW_ROWS) start_=frameBound
1192-
| frameType=(KW_RANGE | KW_ROWS) KW_BETWEEN start_=frameBound KW_AND end=frameBound
1169+
: frameType=(KW_RANGE | KW_ROWS) start=frameBound
1170+
| frameType=(KW_RANGE | KW_ROWS) KW_BETWEEN start=frameBound KW_AND end=frameBound
11931171
;
11941172

11951173
frameBound
1196-
: KW_UNBOUNDED boundType=(KW_PRECEDING | KW_FOLLOWING)
1174+
: (KW_UNBOUNDED | expression) boundType=(KW_PRECEDING | KW_FOLLOWING)
11971175
| boundType=KW_CURRENT KW_ROW
1198-
| expression boundType=(KW_PRECEDING | KW_FOLLOWING)
11991176
;
12001177

12011178
qualifiedNameList

0 commit comments

Comments
 (0)