Skip to content

Commit 8b106e9

Browse files
h3n4l and claude authored
feat: add CosmosDB parser to monorepo (#35)
This commit adds the CosmosDB parser from the standalone cosmosdb-parser repository into the unified parser monorepo. Changes: - Added CosmosDB grammar files (CosmosDBLexer.g4, CosmosDBParser.g4) - Added test infrastructure with 5 example SQL files - Created Makefile for building and testing the parser - Updated package names from 'parser' to 'cosmosdb' - Updated import paths to github.com/bytebase/parser/cosmosdb - Updated CI workflow to include cosmosdb in the test matrix - Generated parser files using ANTLR 4 with Go target - All 5 tests passing successfully 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <[email protected]>
1 parent 01783b6 commit 8b106e9

17 files changed

+8119
-1
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
id: set-matrix
3535
run: |
3636
# List of all available parsers
37-
ALL_PARSERS="redshift postgresql cql snowflake tsql doris trino plsql googlesql mysql partiql tidb bq mariadb"
37+
ALL_PARSERS="redshift postgresql cql snowflake tsql doris trino plsql googlesql mysql partiql tidb bq mariadb cosmosdb"
3838
# Add more parsers here as they are added to the repository
3939
# ALL_PARSERS="redshift mysql postgresql"
4040

cosmosdb/CosmosDBLexer.g4

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// Lexer for the Cosmos DB SQL dialect.
lexer grammar CosmosDBLexer;

// With this option every literal and character set in the grammar matches
// both the lower-case and the upper-case form of a letter.
options { caseInsensitive = true; }

// One fragment per letter of the alphabet. Fragments never become tokens on
// their own; no rule visible in this lexer refers to them (keywords are spelled
// as plain string literals).
fragment A: [a];    fragment B: [b];    fragment C: [c];    fragment D: [d];
fragment E: [e];    fragment F: [f];    fragment G: [g];    fragment H: [h];
fragment I: [i];    fragment J: [j];    fragment K: [k];    fragment L: [l];
fragment M: [m];    fragment N: [n];    fragment O: [o];    fragment P: [p];
fragment Q: [q];    fragment R: [r];    fragment S: [s];    fragment T: [t];
fragment U: [u];    fragment V: [v];    fragment W: [w];    fragment X: [x];
fragment Y: [y];    fragment Z: [z];
33+
34+
// NOTE: the declaration order of the tokens below is significant. When two
// rules match the same number of characters ANTLR picks the one declared
// first, and token type numbers follow declaration order as well.

MULTIPLY_OPERATOR       : '*';

// ---- Keywords (declared ahead of IDENTIFIER so they win equal-length ties) ----
AS_SYMBOL               : 'AS';
SELECT_SYMBOL           : 'SELECT';
FROM_SYMBOL             : 'FROM';
DISTINCT_SYMBOL         : 'DISTINCT';
UNDEFINED_SYMBOL        : 'UNDEFINED';
NULL_SYMBOL             : 'NULL';
FALSE_SYMBOL            : 'FALSE';
TRUE_SYMBOL             : 'TRUE';
NOT_SYMBOL              : 'NOT';
UDF_SYMBOL              : 'UDF';
WHERE_SYMBOL            : 'WHERE';
AND_SYMBOL              : 'AND';
OR_SYMBOL               : 'OR';

// ---- Punctuation and operators ----
AT_SYMBOL               : '@';
LC_BRACKET_SYMBOL       : '{';
RC_BRACKET_SYMBOL       : '}';
LS_BRACKET_SYMBOL       : '[';
RS_BRACKET_SYMBOL       : ']';
LR_BRACKET_SYMBOL       : '(';
RR_BRACKET_SYMBOL       : ')';
SINGLE_QUOTE_SYMBOL     : '\'';
DOUBLE_QUOTE_SYMBOL     : '"';
COMMA_SYMBOL            : ',';
DOT_SYMBOL              : '.';
QUESTION_MARK_SYMBOL    : '?';
COLON_SYMBOL            : ':';
PLUS_SYMBOL             : '+';
MINUS_SYMBOL            : '-';
BIT_NOT_SYMBOL          : '~';
DIVIDE_SYMBOL           : '/';
MODULO_SYMBOL           : '%';
BIT_AND_SYMBOL          : '&';
BIT_OR_SYMBOL           : '|';
DOUBLE_BAR_SYMBOL       : '||';   // longest match makes "||" one token rather than two '|'
BIT_XOR_SYMBOL          : '^';
EQUAL_SYMBOL            : '=';
73+
74+
/*
 * Identifiers.
 *
 * An identifier starts with a letter or an underscore and continues with
 * letters, digits or underscores. The leading underscore is needed for
 * Cosmos DB system properties such as _ts, _rid and _etag
 * (e.g. `SELECT c._ts FROM c`).
 *
 * This rule must stay below the keyword rules: on an equal-length match ANTLR
 * picks the rule declared first, so `select` lexes as a keyword.
 */
IDENTIFIER: [a-z_] [a-z_0-9]*;

// Whitespace goes to the hidden channel (one token per character), so the
// parser never sees it.
WHITESPACE: [ \t\f\r\n] -> channel(HIDDEN);
80+
81+
// ---- Numeric literals ----

fragment DEC_DIGIT: [0-9];

// A number written with a decimal point: "1.5", "1." or ".5".
fragment DEC_DOT_DEC:
    DEC_DIGIT+ '.' DEC_DIGIT+
    | DEC_DIGIT+ '.'
    | '.' DEC_DIGIT+;

// Plain run of digits.
DECIMAL: DEC_DIGIT+;

// Integer or dotted number followed by a mandatory exponent, e.g. "1e10", "2.5E-3".
REAL: (DECIMAL | DEC_DOT_DEC) 'E' [+-]? DEC_DIGIT+;

// Dotted number without an exponent.
FLOAT: DEC_DOT_DEC;

// ---- Hexadecimal literal: "0x" followed by one or more hex digits ----

fragment HEX_DIGIT: [0-9A-F];

HEXADECIMAL: '0' 'X' HEX_DIGIT+;
96+
97+
// Non-ASCII letter ranges, matched case-sensitively (the grammar-wide
// caseInsensitive option is switched off for this rule only).
// No rule visible in this lexer references this fragment.
fragment FullWidthLetter options { caseInsensitive = false; }:
    [\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\u2c00-\u2fff\u3040-\u318f\u3300-\u337f\u3400-\u3fff\u4e00-\u9fff\ua000-\ud7ff\uf900-\ufaff\uff00-\ufff0];
// Code points beyond the BMP are not supported:
// | '\u10000'..'\u1F9FF' | '\u20000'..'\u2FA1F'
113+
114+
// ---- String literals ----

// Backslash escapes accepted inside either kind of string:
//   \b \t \n \r \f \" \' \\ \/   and   \uXXXX (four hex digits).
fragment ESCAPE_SEQUENCE:
    '\\' [btnrf"'\\/]
    | '\\u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT;

// One character of a single-quoted string: an escape sequence, or anything
// except backslash, the closing quote (') and line breaks. An unescaped
// double quote is allowed, so 'say "hi"' is a single literal.
fragment SINGLE_QUOTE_STRING_CHAR:
    ESCAPE_SEQUENCE
    | ~[\\'\r\n];

// One character of a double-quoted string: an escape sequence, or anything
// except backslash, the closing quote (") and line breaks. An unescaped
// single quote is allowed, so "it's" is a single literal.
fragment DOUBLE_QUOTE_STRING_CHAR:
    ESCAPE_SEQUENCE
    | ~[\\"\r\n];

SINGLE_QUOTE_STRING_LITERAL:
    SINGLE_QUOTE_SYMBOL SINGLE_QUOTE_STRING_CHAR* SINGLE_QUOTE_SYMBOL;

DOUBLE_QUOTE_STRING_LITERAL:
    DOUBLE_QUOTE_SYMBOL DOUBLE_QUOTE_STRING_CHAR* DOUBLE_QUOTE_SYMBOL;

cosmosdb/CosmosDBParser.g4

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
// Parser for the Cosmos DB SQL dialect; token types come from CosmosDBLexer.
parser grammar CosmosDBParser;

options { tokenVocab = CosmosDBLexer; }

// Entry rule: exactly one query, then end of input.
root
    : select EOF
    ;

// SELECT ... FROM ... [WHERE ...]
select
    : select_clause from_clause where_clause?
    ;

select_clause
    : SELECT_SYMBOL select_specification
    ;

// Either "*" or an (optionally DISTINCT) list of projected properties.
select_specification
    : MULTIPLY_OPERATOR
    | DISTINCT_SYMBOL? object_property_list
    ;

from_clause
    : FROM_SYMBOL from_specification
    ;

where_clause
    : WHERE_SYMBOL scalar_expression_in_where
    ;

from_specification
    : from_source
    ;

from_source
    : container_expression
    ;

// A container name with an optional alias; the AS keyword may be omitted.
container_expression
    : container_name (AS_SYMBOL? IDENTIFIER)?
    ;

container_name
    : IDENTIFIER
    ;

// Comma-separated projection list.
object_property_list
    : object_property (COMMA_SYMBOL object_property)*
    ;

// One projected expression with an optional alias; the AS keyword may be omitted.
object_property
    : scalar_expression (AS_SYMBOL? property_alias)?
    ;

property_alias
    : IDENTIFIER
    ;
35+
36+
// Scalar expressions allowed in the SELECT list: an alias, a property path
// built with ".name" / ["name"] / [index], optionally prefixed by a unary operator.
// Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
scalar_expression
    : input_alias
    | scalar_expression DOT_SYMBOL property_name
    | scalar_expression LS_BRACKET_SYMBOL (DOUBLE_QUOTE_STRING_LITERAL | array_index) RS_BRACKET_SYMBOL
    | unary_operator scalar_expression
    ;
45+
46+
// TODO(zp): Merge scalar_expression and scalar_expression_in_where while supporting the project
// fully. https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
//
// Expression grammar for the WHERE clause.
//
// In a left-recursive ANTLR rule the alternatives listed first bind tightest,
// so the operator alternatives are ordered from highest to lowest precedence:
//   1. member access      e.name   e["name"]   e[0]
//   2. unary operators    unary_operator e
//   3. binary operators   e binary_operator e
//   4. AND
//   5. OR
//   6. conditional        e ? e : e
// This makes `c.a = 1 AND c.b = 2` group as `(c.a = 1) AND (c.b = 2)`.
// The non-recursive alternatives keep their original relative order, which is
// what ANTLR uses to break ties between them (e.g. an array literal matches
// `constant` before `create_array_expression`).
//
// NOTE(review): every operator in binary_operator still shares one precedence
// level, so `a + 1 = b * 2` groups strictly left to right.
scalar_expression_in_where:
    constant
    | input_alias
    | parameter_name
    | scalar_function_expression
    | create_object_expression
    | create_array_expression
    | LR_BRACKET_SYMBOL scalar_expression_in_where RR_BRACKET_SYMBOL
    | scalar_expression_in_where DOT_SYMBOL property_name
    | scalar_expression_in_where LS_BRACKET_SYMBOL (
        (DOUBLE_QUOTE_STRING_LITERAL)
        | (array_index)
    ) RS_BRACKET_SYMBOL
    | unary_operator scalar_expression_in_where
    | scalar_expression_in_where binary_operator scalar_expression_in_where
    | scalar_expression_in_where AND_SYMBOL scalar_expression_in_where
    | scalar_expression_in_where OR_SYMBOL scalar_expression_in_where
    | scalar_expression_in_where QUESTION_MARK_SYMBOL scalar_expression_in_where COLON_SYMBOL
        scalar_expression_in_where;
67+
68+
// Array creation in an expression; currently limited to constant arrays.
create_array_expression: array_constant;

// Object creation in an expression; currently limited to constant objects.
create_object_expression: object_constant;

// A function call: either a user-defined function (udf.name(...)) or a
// built-in function (name(...)).
scalar_function_expression:
    udf_scalar_function_expression
    | builtin_function_expression;

// udf.<name>( [arg {, arg}] )
// The argument list is optional so that zero-argument calls such as
// `udf.now()` are accepted.
udf_scalar_function_expression:
    UDF_SYMBOL DOT_SYMBOL IDENTIFIER LR_BRACKET_SYMBOL (
        scalar_expression_in_where (
            COMMA_SYMBOL scalar_expression_in_where
        )*
    )? RR_BRACKET_SYMBOL;

// <name>( [arg {, arg}] )
// The argument list is optional so that zero-argument built-ins such as
// `PI()` or `GetCurrentDateTime()` are accepted.
builtin_function_expression:
    IDENTIFIER LR_BRACKET_SYMBOL (
        scalar_expression_in_where (
            COMMA_SYMBOL scalar_expression_in_where
        )*
    )? RR_BRACKET_SYMBOL;
89+
90+
// Infix operators. They all share a single precedence level in the rules that
// use them.
binary_operator:
    MULTIPLY_OPERATOR
    | DIVIDE_SYMBOL
    | MODULO_SYMBOL
    | PLUS_SYMBOL
    | MINUS_SYMBOL
    | BIT_AND_SYMBOL
    | BIT_XOR_SYMBOL
    | BIT_OR_SYMBOL
    | DOUBLE_BAR_SYMBOL
    | EQUAL_SYMBOL;

// Prefix operators: bitwise NOT (~), unary plus/minus and logical NOT.
// NOT_SYMBOL is included so that `WHERE NOT c.active` parses.
// NOTE(review): NOT binds like the other prefix operators here, i.e. tighter
// than binary_operator — confirm that this matches the intended grouping of
// `NOT a = b`.
unary_operator:
    BIT_NOT_SYMBOL
    | PLUS_SYMBOL
    | MINUS_SYMBOL
    | NOT_SYMBOL;

// Query parameter reference: "@" followed by an identifier, e.g. @id.
parameter_name: AT_SYMBOL IDENTIFIER;
105+
106+
// Constant values.
// https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/constants
constant:
    undefined_constant
    | null_constant
    | boolean_constant
    | number_constant
    | string_constant
    | array_constant
    | object_constant;

// Object literal: { [pair {, pair}] }.
// The pair list is optional so that the empty object `{}` is accepted,
// mirroring array_constant, which accepts `[]`.
object_constant:
    LC_BRACKET_SYMBOL (
        object_constant_field_pair (
            COMMA_SYMBOL object_constant_field_pair
        )*
    )? RC_BRACKET_SYMBOL;

// One `name: value` member of an object literal. The name is either a bare
// identifier or a double-quoted string.
//  - The separator between name and value is a colon; commas only separate
//    whole pairs (see object_constant).
//  - The lexer emits `"name"` as a single DOUBLE_QUOTE_STRING_LITERAL token, so
//    that token is accepted here. The older three-token alternative is kept so
//    existing generated accessors remain available.
object_constant_field_pair: (
        property_name
        | DOUBLE_QUOTE_STRING_LITERAL
        | (DOUBLE_QUOTE_SYMBOL property_name DOUBLE_QUOTE_SYMBOL)
    ) COLON_SYMBOL constant;
127+
128+
// Array literal: "[" [constant {"," constant}] "]"; may be empty.
array_constant
    : LS_BRACKET_SYMBOL (constant (COMMA_SYMBOL constant)*)? RS_BRACKET_SYMBOL
    ;

string_constant
    : string_literal
    ;

undefined_constant
    : UNDEFINED_SYMBOL
    ;

null_constant
    : NULL_SYMBOL
    ;

boolean_constant
    : TRUE_SYMBOL
    | FALSE_SYMBOL
    ;

number_constant
    : decimal_literal
    | hexadecimal_literal
    ;

// A string in either quoting style.
string_literal
    : SINGLE_QUOTE_STRING_LITERAL
    | DOUBLE_QUOTE_STRING_LITERAL
    ;

// Any of the three non-hexadecimal numeric token kinds.
decimal_literal
    : DECIMAL
    | REAL
    | FLOAT
    ;

hexadecimal_literal
    : HEXADECIMAL
    ;

property_name
    : IDENTIFIER
    ;

// Index inside [...]: a plain run of digits.
array_index
    : DECIMAL
    ;

input_alias
    : IDENTIFIER
    ;

cosmosdb/Makefile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Build and test entry points for the CosmosDB parser.
#
# The targets below are commands, not files. Declaring them phony makes make
# run them even if a file or directory called "all", "build" or "test" exists
# in this directory.
.PHONY: all build test

# Default target: regenerate the parser, then run the tests.
all: build test

# Run ANTLR with the Go target on both grammars, writing the generated
# package "cosmosdb" (including visitor code) into the current directory.
build:
	antlr -Dlanguage=Go -package cosmosdb -visitor -o . CosmosDBLexer.g4 CosmosDBParser.g4

# Run the Go tests whose names match TestCosmosDBParser, verbosely.
test:
	go test -v -run TestCosmosDBParser

cosmosdb/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# cosmosdb-parser
2+
3+
Cosmos DB SQL parser based on ANTLR4.
4+
5+
## References
6+
7+
- [Queries in Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/)

0 commit comments

Comments
 (0)