Skip to content

Commit a2d35ad

Browse files
committed
code and tests
1 parent acac128 commit a2d35ad

File tree

109 files changed

+13969
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

109 files changed

+13969
-0
lines changed

README.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,78 @@
11
# MindsDB SQL Parser 🚧
2+
3+
4+
# Installation
5+
6+
```
7+
pip install mindsdb_parser
8+
```
9+
10+
## How to use
11+
12+
```python
13+
14+
from mindsdb_parser import parse_sql
15+
16+
query = parse_sql('select b from aaa where c=1')
17+
18+
# result is abstract syntax tree (AST)
19+
query
20+
21+
# string representation of AST
22+
query.to_tree()
23+
24+
# representation of the tree as an SQL string; it may not exactly match the original SQL
25+
query.to_string()
26+
27+
```
28+
29+
## Architecture
30+
31+
The [SLY](https://sly.readthedocs.io/en/latest/sly.html) library is used for parsing.
32+
33+
Parsing consists of 2 stages (with a separate module for every dialect):
34+
- Defining keywords in lexer.py module. It is made mostly with regexp
35+
- Defining syntax rules in parser.py module. It is made by describing rules in [BNF grammar](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form)
36+
- Syntax is defined in decorator of function. Inside of decorator you can use keyword itself or other function from parser
37+
- Output of function can be used as input in other functions of parser
38+
- Outputs of the parser are listed in "Top-level statements". Each of them has to be an Abstract syntax tree (AST) object.
39+
40+
SLY does not support inheritance, therefore every dialect is described completely, without one dialect extending another.
41+
42+
### [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
43+
- Structure of AST is defined in separate modules (in parser/ast/).
44+
- It can be inherited
45+
- Every class has to have these methods:
46+
- to_tree - to return hierarchical representation of object
47+
- get_string - to return object as sql expression (or sub-expression)
48+
- copy - to copy AST-tree to new object
49+
50+
### Error handling
51+
52+
For a better user experience, a parsing error contains useful information about the location of the problem and a possible way to fix it.
53+
1. it shows location of error if
54+
- character isn't parsed (by lexer)
55+
- token is unexpected (by parser)
56+
2. it tries to propose a correct token to use instead of (or before) the token at the error location. Possible options:
57+
- A keyword will be shown as is.
58+
- '[number]' - if a float or an integer is expected
59+
- '[string]' - if string is expected
60+
- '[identifier]' - if the name of an object is expected. For example, the bold words here are identifiers:
61+
- "select **x** as **name** from **tbl1** where **col**=1"
62+
63+
How suggestion works:
64+
It uses next possible tokens defined by syntax rules.
65+
If this is the end of the query: just shows these tokens.
66+
Else:
67+
- it tries to replace bad token with other token from list of possible tokens
68+
- tries to parse query once again, if there is no error:
69+
- add this token to suggestion list
70+
- second iteration: put possible token before bad token (instead of replacement) and repeat the same operation.
71+
72+
73+
# How to test
74+
75+
```bash
76+
pip install -r requirements_test.txt
77+
env PYTHONPATH=./ pytest
78+
```

mindsdb_parser/__about__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
__title__ = 'mindsdb_parser'
2+
__package_name__ = 'mindsdb_parser'
3+
__version__ = '0.0.1'
4+
__description__ = "Mindsdb SQL parser"
5+
__email__ = "[email protected]"
6+
__author__ = 'MindsDB Inc'
7+
__github__ = 'https://github.com/mindsdb/mindsdb_sql_parser'
8+
__pypi__ = 'https://pypi.org/project/mindsdb_parser'
9+
__license__ = 'MIT'
10+
__copyright__ = 'Copyright 2021- mindsdb'

mindsdb_parser/__init__.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
import re
2+
from collections import defaultdict
3+
4+
from sly.lex import Token
5+
6+
from mindsdb_parser.exceptions import ParsingException
7+
from mindsdb_parser.ast import *
8+
9+
10+
class ErrorHandling:
    """Turn the error information left by a failed parse into a readable message.

    The message shows where the query failed and, when possible, which
    tokens could be used at that position.
    """

    def __init__(self, lexer, parser):
        """Keep the lexer (for token patterns) and the parser (for re-parsing)."""
        self.parser = parser
        self.lexer = lexer

    def process(self, error_info):
        """Build the complete error message.

        :param error_info: mapping with the keys ``'tokens'`` (all tokens of
            the query, ``None`` entries are ignored), ``'bad_token'`` (the
            offending token, or ``None`` at the end of the query) and
            ``'expected_tokens'`` (names of the token types accepted at the
            error position).
        :return: the message as a single string with one part per line, or
            ``'Empty input'`` when there are no tokens at all.
        """
        self.tokens = [t for t in error_info['tokens'] if t is not None]
        self.bad_token = error_info['bad_token']
        self.expected_tokens = error_info['expected_tokens']

        if not self.tokens:
            return 'Empty input'

        # show error location
        msgs = self.error_location()

        # suggestion
        suggestions = self.make_suggestion()

        if suggestions:
            prefix = 'Possible inputs: ' if len(suggestions) > 1 else 'Expected symbol: '
            msgs.append(prefix + ', '.join([f'"{item}"' for item in suggestions]))
        return '\n'.join(msgs)

    def error_location(self):
        """Return message lines that show the query text and mark the error.

        The query text is rebuilt from the ``lineno``, ``index`` and ``value``
        attributes of the tokens. Up to three source lines (the error line and
        the two lines before it) are shown, each prefixed with ``'>'``, and
        followed by a line of ``'-'`` and ``'^'`` pointing at the error.
        """
        # restore query text, one string per line number
        lines_idx = defaultdict(str)

        # used + unused tokens
        for token in self.tokens:
            if token is None:
                continue
            line = lines_idx[token.lineno]

            # place the token value at its index: cut what overlaps it,
            # otherwise pad the gap with spaces
            # NOTE(review): token.index looks like an offset from the start of
            # the whole query (not of the line) - confirm against the lexer
            if len(line) > token.index:
                line = line[: token.index]
            else:
                line = line.ljust(token.index)

            line += token.value
            lines_idx[token.lineno] = line

        msgs = []

        # error message and location
        if self.bad_token is None:
            msgs.append('Syntax error, unexpected end of query:')
            error_len = 1
            # the error is right after the end of the last line
            error_line_num = list(lines_idx.keys())[-1]
            error_index = len(lines_idx[error_line_num])
        else:
            msgs.append('Syntax error, unknown input:')
            error_len = len(self.bad_token.value)
            error_line_num = self.bad_token.lineno
            error_index = self.bad_token.index

        # shift lines indexes (it removes spaces from beginnings of the lines)
        lines = []
        shift = 0
        error_line = 0
        for i, line_num in enumerate(lines_idx.keys()):
            if line_num == error_line_num:
                error_index -= shift
                error_line = i

            line = lines_idx[line_num]
            lines.append(line[shift:])
            shift = len(line)

        # add source code: the error line and up to two lines before it
        first_line = error_line - 2 if error_line > 1 else 0
        for line in lines[first_line: error_line + 1]:
            msgs.append('>' + line)

        # error position ('+ 1' compensates for the '>' prefix of source lines)
        msgs.append('-' * (error_index + 1) + '^' * error_len)
        return msgs

    def make_suggestion(self):
        """Return the list of inputs that could be used at the error position.

        Expected token types are mapped to display values: ``'[identifier]'``,
        ``'[number]'``, ``'[string]'`` or the keyword itself. Token patterns
        that are real regular expressions (contain ``\\s`` or ``|``) have no
        single display value and are skipped.

        If only one value remains it is returned as is. If there are between
        2 and 19 values: at the end of the query all of them are returned,
        otherwise only those for which the modified query parses. Twenty or
        more values produce no suggestion.
        """
        if len(self.expected_tokens) == 0:
            return []

        # find error index
        error_index = None
        for i, token in enumerate(self.tokens):
            if token is self.bad_token:
                error_index = i

        expected = {}  # value: token

        for token_name in self.expected_tokens:
            value = getattr(self.lexer, token_name, None)
            if token_name == 'ID':
                # a lot of other tokens could be ID
                expected = {'[identifier]': token_name}
                break
            elif token_name in ('FLOAT', 'INTEGER'):
                expected['[number]'] = token_name

            elif token_name in ('DQUOTE_STRING', 'QUOTE_STRING'):
                expected['[string]'] = token_name

            elif isinstance(value, str):
                # Skip patterns that are real regular expressions. The check
                # has to look at the raw pattern: once the backslashes are
                # stripped, '\s' can no longer be detected.
                if '\\s' in value or '|' in value:
                    continue

                value = value.replace('\\b', '').replace('\\', '')
                expected[value] = token_name

        suggestions = []
        if len(expected) == 1:
            # use only it
            first_value = list(expected.keys())[0]
            suggestions.append(first_value)

        elif 1 < len(expected) < 20:
            if self.bad_token is None:
                # if this is the end of query, just show next expected keywords
                return list(expected.keys())

            if error_index is None:
                # the bad token is not among the known tokens: candidates
                # can't be verified, so don't suggest anything
                return []

            # not every suggestion satisfy the end of the query. we have to check if it works
            for value, token_name in expected.items():
                # make up a token
                token = Token()
                token.type = token_name
                token.value = value
                token.end = 0
                token.index = 0
                token.lineno = 0

                # first candidate: put the token before the bad token
                candidates = [
                    self.tokens[:error_index] + [token] + self.tokens[error_index:]
                ]

                # second candidate: put the token in place of the token that
                # precedes the bad one; impossible when the bad token is first
                # NOTE(review): README describes this step as replacing the bad
                # token itself - confirm which one is intended
                if error_index > 0:
                    candidates.append(
                        self.tokens[:error_index - 1] + [token] + self.tokens[error_index:]
                    )

                if any(self.query_is_valid(candidate) for candidate in candidates):
                    suggestions.append(value)

        return suggestions

    def query_is_valid(self, tokens):
        """Return True if the parser produces a result for the list of tokens."""
        ast = self.parser.parse(iter(tokens))
        return ast is not None
162+
163+
164+
def parse_sql(sql):
    """Parse an SQL string and return the resulting AST.

    Trailing whitespace and semicolons are removed before tokenizing.

    :param sql: text of the query
    :return: the object returned by the parser
    :raises ParsingException: if the parser returns ``None``; the message is
        built by ``ErrorHandling`` from ``parser.error_info``
    """
    # imported here, not at module level, exactly as in the original
    from mindsdb_parser.lexer import MindsDBLexer
    from mindsdb_parser.parser import MindsDBParser

    lexer = MindsDBLexer()
    parser = MindsDBParser()

    # remove ending semicolon and spaces
    cleaned_sql = re.sub(r'[\s;]+$', '', sql)

    ast = parser.parse(lexer.tokenize(cleaned_sql))
    if ast is not None:
        return ast

    message = ErrorHandling(lexer, parser).process(parser.error_info)
    raise ParsingException(message)

mindsdb_parser/ast/__init__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from .base import ASTNode
2+
from .select import *
3+
from .show import *
4+
from .use import *
5+
from .describe import *
6+
from .set import *
7+
from .start_transaction import *
8+
from .rollback_transaction import *
9+
from .commit_transaction import *
10+
from .explain import *
11+
from .alter_table import *
12+
from .insert import *
13+
from .update import *
14+
from .delete import *
15+
from .drop import *
16+
from .create import *
17+
from .variable import *
18+
19+
from .mindsdb.latest import Latest

mindsdb_parser/ast/alter_table.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from mindsdb_parser.ast.base import ASTNode
2+
from mindsdb_parser.utils import indent
3+
4+
5+
class Alter(ASTNode):
    """Placeholder AST node; it adds nothing to ASTNode."""
7+
8+
9+
class AlterTable(ASTNode):
    """AST node of an ``ALTER TABLE <target> <arg>`` statement."""

    def __init__(self, target, arg, *args, **kwargs):
        """Store the statement parts.

        :param target: node of the altered table; it has to provide
            ``to_tree(level=...)`` and ``str()``
        :param arg: the rest of the statement, rendered as is
        Other arguments are passed to ``ASTNode``.
        """
        super().__init__(*args, **kwargs)
        self.target = target
        self.arg = arg

    def to_tree(self, *args, level=0, **kwargs):
        """Return the hierarchical representation of the node."""
        pad = indent(level)
        target_tree = self.target.to_tree(level=level + 2)
        return (
            f'{pad}AlterTable('
            f'target={target_tree}, '
            f'arg={self.arg!r},'
            f'\n{pad})'
        )

    def get_string(self, *args, **kwargs):
        """Return the node as an SQL string."""
        return f'ALTER TABLE {self.target!s} {self.arg}'
31+

mindsdb_parser/ast/base.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import copy
2+
3+
from mindsdb_parser.exceptions import ParsingException
4+
from mindsdb_parser.utils import to_single_line
5+
6+
7+
class ASTNode:
    """Base class of AST nodes.

    Holds an optional alias and a parentheses flag and implements rendering
    to string, copying, comparison and ``repr``. ``to_tree`` and
    ``get_string`` do nothing here (they return ``None``).
    """

    def __init__(self, alias=None, parentheses=False):
        """
        :param alias: optional alias node; it has to provide ``parts`` and
            ``to_string(alias=...)``
        :param parentheses: whether the rendered node is wrapped in ``()``
        :raises ParsingException: if the alias has more than one part
        """
        self.alias = alias
        self.parentheses = parentheses

        if self.alias:
            if len(self.alias.parts) > 1:
                raise ParsingException('Alias can not contain multiple parts (dots).')

    def maybe_add_alias(self, some_str, alias=True):
        """Append ``AS <alias>`` if the node has an alias and ``alias`` is truthy."""
        if not (self.alias and alias):
            return some_str
        return f'{some_str} AS {self.alias.to_string(alias=False)}'

    def maybe_add_parentheses(self, some_str):
        """Wrap the string in parentheses if the node is marked with them."""
        return f'({some_str})' if self.parentheses else some_str

    def to_tree(self, *args, **kwargs):
        """Hierarchical representation of the node; nothing in the base class."""
        pass

    def get_string(self):
        """The node as an SQL (sub-)expression; nothing in the base class."""
        pass

    def to_string(self, alias=True):
        """Render the node: ``get_string()`` plus parentheses and alias."""
        rendered = self.get_string()
        rendered = self.maybe_add_parentheses(rendered)
        return self.maybe_add_alias(rendered, alias=alias)

    def copy(self):
        """Return a deep copy of the node."""
        return copy.deepcopy(self)

    def __str__(self):
        return self.to_string()

    def __eq__(self, other):
        # nodes are equal if both their trees and their single-line strings match
        if not isinstance(other, ASTNode):
            return False
        return self.to_tree() == other.to_tree() and to_single_line(str(self)) == to_single_line(str(other))

    def __repr__(self):
        # one-line SQL, cut to 500 characters
        flat_sql = self.to_string().replace('\n', ' ')
        shown = flat_sql[:500] + '...' if len(flat_sql) > 500 else flat_sql
        return f'{self.__class__.__name__}:<{shown}>'
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from mindsdb_parser.ast.base import ASTNode
2+
from mindsdb_parser.utils import indent
3+
4+
5+
class CommitTransaction(ASTNode):
    """AST node of the ``commit`` statement."""

    def __init__(self, *args, **kwargs):
        """All arguments are passed to ``ASTNode``."""
        super().__init__(*args, **kwargs)

    def to_tree(self, *args, level=0, **kwargs):
        """Return the hierarchical representation of the node."""
        return f'{indent(level)}CommitTransaction()'

    def get_string(self, *args, **kwargs):
        """Return the node as an SQL string."""
        return 'commit'

0 commit comments

Comments
 (0)