mindsdb
diff --git a/‎README.md‎
Lines changed: 77 additions & 0 deletions b/‎README.md‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎mindsdb_parser/__about__.py‎
Lines changed: 10 additions & 0 deletions b/‎mindsdb_parser/__about__.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎mindsdb_parser/__init__.py‎
Lines changed: 182 additions & 0 deletions b/‎mindsdb_parser/__init__.py‎
Lines changed: 182 additions & 0 deletions
diff --git a/‎mindsdb_parser/ast/__init__.py‎
Lines changed: 19 additions & 0 deletions b/‎mindsdb_parser/ast/__init__.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎mindsdb_parser/ast/alter_table.py‎
Lines changed: 31 additions & 0 deletions b/‎mindsdb_parser/ast/alter_table.py‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎mindsdb_parser/ast/base.py‎
Lines changed: 52 additions & 0 deletions b/‎mindsdb_parser/ast/base.py‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎mindsdb_parser/ast/commit_transaction.py‎
Lines changed: 16 additions & 0 deletions b/‎mindsdb_parser/ast/commit_transaction.py‎
Lines changed: 16 additions & 0 deletions
@@ -1 +1,78 @@
 # MindsDB SQL Parser 🚧
+
+
+# Installation
+
+```
+  pip install mindsdb_parser
+```
+
+## How to use
+
+```python
+
+from mindsdb_parser import parse_sql
+
+query = parse_sql('select b from aaa where c=1')
+
+# result is abstract syntax tree (AST) 
+query
+
+# string representation of AST
+query.to_tree()
+
+# representation of the tree as an SQL string; it may not exactly match the original SQL
+query.to_string()
+
+```
+
+## Architecture
+
+Parsing is done with the [SLY](https://sly.readthedocs.io/en/latest/sly.html) library.
+
+Parsing consists of 2 stages (with a separate module for every dialect):
+- Defining keywords in the lexer.py module. This is done mostly with regular expressions.
+- Defining syntax rules in the parser.py module. This is done by describing the rules in [BNF grammar](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form)
+  - The syntax is defined in the decorator of a function. Inside the decorator you can use a keyword itself or another function of the parser
+  - The output of a function can be used as input to other functions of the parser
+  - The outputs of the parser are listed in "Top-level statements". Each of them has to be an Abstract syntax tree (AST) object.
+
+SLY does not support inheritance, therefore every dialect is described completely, without one dialect extending another.  
+
+### [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
+- The structure of the AST is defined in separate modules (in mindsdb_parser/ast/).
+- AST classes can be inherited
+- Every class has to have these methods:
+  - to_tree - returns a hierarchical representation of the object
+  - get_string - returns the object as an SQL expression (or sub-expression)
+  - copy - copies the AST tree to a new object
+
+### Error handling
+
+For a better user experience, a parsing error contains useful information about the location of the problem and a possible way to fix it.
+1. It shows the location of the error if
+  - a character can't be parsed (by the lexer)
+  - a token is unexpected (by the parser)
+2. It tries to propose a correct token to use instead of (or before) the error location. Possible options:
+  - A keyword is shown as is.
+  - '[number]' - if a float or an integer is expected
+  - '[string]' - if a string is expected
+  - '[identifier]' - if the name of an object is expected. For example, the bold words here:
+    - "select **x** as **name** from **tbl1** where **col**=1"
+
+How the suggestion works:
+It uses the next possible tokens defined by the syntax rules.
+If this is the end of the query, it just shows these tokens.
+Otherwise:
+- it tries to replace the bad token with another token from the list of possible tokens
+- it tries to parse the query once again; if there is no error:
+  - it adds this token to the suggestion list
+- second iteration: it puts the possible token before the bad token (instead of replacing it) and repeats the same operation.
+
+
+# How to test
+
+```bash
+pip install -r requierements_test.txt
+env PYTHONPATH=./ pytest
+```
@@ -0,0 +1,10 @@
+# Package metadata for mindsdb_parser: name, version, contact details and project links.
+__title__ = 'mindsdb_parser'
+__package_name__ = 'mindsdb_parser'
+__version__ = '0.0.1'
+__description__ = "Mindsdb SQL parser"
+# NOTE(review): the value below looks like an obfuscated placeholder rather than
+# a real address - confirm the intended contact e-mail.
+__email__ = "[email protected]"
+__author__ = 'MindsDB Inc'
+__github__ = 'https://github.com/mindsdb/mindsdb_sql_parser'
+__pypi__ = 'https://pypi.org/project/mindsdb_parser'
+__license__ = 'MIT'
+__copyright__ = 'Copyright 2021- mindsdb'
@@ -0,0 +1,182 @@
+import re
+from collections import defaultdict
+
+from sly.lex import Token
+
+from mindsdb_parser.exceptions import ParsingException
+from mindsdb_parser.ast import *
+
+
+class ErrorHandling:
+
+    def __init__(self, lexer, parser):
+        self.parser = parser
+        self.lexer = lexer
+
+    def process(self, error_info):
+        self.tokens = [t for t in error_info['tokens'] if t is not None]
+        self.bad_token = error_info['bad_token']
+        self.expected_tokens = error_info['expected_tokens']
+
+        if len(self.tokens) == 0:
+            return 'Empty input'
+
+        # show error location
+        msgs = self.error_location()
+
+        # suggestion
+        suggestions = self.make_suggestion()
+
+        if suggestions:
+            prefix = 'Possible inputs: ' if len(suggestions) > 1 else 'Expected symbol: '
+            msgs.append(prefix + ', '.join([f'"{item}"' for item in suggestions]))
+        return '\n'.join(msgs)
+
+    def error_location(self):
+
+        # restore query text
+        lines_idx = defaultdict(str)
+
+        # used + unused tokens
+        for token in self.tokens:
+            if token is None:
+                continue
+            line = lines_idx[token.lineno]
+
+            if len(line) > token.index:
+                line = line[: token.index]
+            else:
+                line = line.ljust(token.index)
+
+            line += token.value
+            lines_idx[token.lineno] = line
+
+        msgs = []
+
+        # error message and location
+        if self.bad_token is None:
+            msgs.append('Syntax error, unexpected end of query:')
+            error_len = 1
+            # last line
+            error_line_num = list(lines_idx.keys())[-1]
+            error_index = len(lines_idx[error_line_num])
+        else:
+            msgs.append('Syntax error, unknown input:')
+            error_len = len(self.bad_token.value)
+            error_line_num = self.bad_token.lineno
+            error_index = self.bad_token.index
+
+        # shift lines indexes (it removes spaces from beginnings of the lines)
+        lines = []
+        shift = 0
+        error_line = 0
+        for i, line_num in enumerate(lines_idx.keys()):
+            if line_num == error_line_num:
+                error_index -= shift
+                error_line = i
+
+            line = lines_idx[line_num]
+            lines.append(line[shift:])
+            shift = len(line)
+
+        # add source code
+        first_line = error_line - 2 if error_line > 1 else 0
+        for line in lines[first_line: error_line + 1]:
+            msgs.append('>' + line)
+
+        # error position
+        msgs.append('-' * (error_index + 1) + '^' * error_len)
+        return msgs
+
+    def make_suggestion(self):
+        if len(self.expected_tokens) == 0:
+            return []
+
+        # find error index
+        error_index = None
+        for i, token in enumerate(self.tokens):
+            if token is self.bad_token :
+                error_index = i
+
+        expected = {}  # value: token
+
+        for token_name in self.expected_tokens:
+            value = getattr(self.lexer, token_name, None)
+            if token_name == 'ID':
+                # a lot of other tokens could be ID
+                expected = {'[identifier]': token_name}
+                break
+            elif token_name in ('FLOAT', 'INTEGER'):
+                expected['[number]'] = token_name
+
+            elif token_name in ('DQUOTE_STRING', 'QUOTE_STRING'):
+                expected['[string]'] = token_name
+
+            elif isinstance(value, str):
+                value = value.replace('\\b', '').replace('\\', '')
+
+                # doesn't content regexp
+                if '\\s' not in value and '|' not in value:
+                    expected[value] = token_name
+
+        suggestions = []
+        if len(expected) == 1:
+            # use only it
+            first_value = list(expected.keys())[0]
+            suggestions.append(first_value)
+
+        elif 1 < len(expected) < 20:
+            if self.bad_token is None:
+                # if this is the end of query, just show next expected keywords
+                return list(expected.keys())
+
+            # not every suggestion satisfy the end of the query. we have to check if it works
+            for value, token_name in expected.items():
+                # make up a token
+                token = Token()
+                token.type = token_name
+                token.value = value
+                token.end = 0
+                token.index = 0
+                token.lineno = 0
+
+                # try to add token
+                tokens2 = self.tokens[:error_index] + [token] + self.tokens[error_index:]
+                if self.query_is_valid(tokens2):
+                    suggestions.append(value)
+                    continue
+
+                # try to replace token
+                tokens2 = self.tokens[:error_index - 1] + [token] + self.tokens[error_index:]
+                if self.query_is_valid(tokens2):
+                    suggestions.append(value)
+                    continue
+
+        return suggestions
+
+    def query_is_valid(self, tokens):
+        # try to parse list of tokens
+
+        ast = self.parser.parse(iter(tokens))
+        return ast is not None
+
+
+def parse_sql(sql):
+    from mindsdb_parser.lexer import MindsDBLexer
+    from mindsdb_parser.parser import MindsDBParser
+    lexer, parser = MindsDBLexer(), MindsDBParser()
+
+    # remove ending semicolon and spaces
+    sql = re.sub(r'[\s;]+$', '', sql)
+
+    tokens = lexer.tokenize(sql)
+    ast = parser.parse(tokens)
+
+    if ast is None:
+
+        eh = ErrorHandling(lexer, parser)
+        message = eh.process(parser.error_info)
+
+        raise ParsingException(message)
+
+    return ast
@@ -0,0 +1,19 @@
+from .base import ASTNode
+from .select import *
+from .show import *
+from .use import *
+from .describe import *
+from .set import *
+from .start_transaction import *
+from .rollback_transaction import *
+from .commit_transaction import *
+from .explain import *
+from .alter_table import *
+from .insert import *
+from .update import *
+from .delete import *
+from .drop import *
+from .create import *
+from .variable import *
+
+from .mindsdb.latest import Latest
@@ -0,0 +1,31 @@
+from mindsdb_parser.ast.base import ASTNode
+from mindsdb_parser.utils import indent
+
+
+class Alter(ASTNode):
+    ...
+
+
+class AlterTable(ASTNode):
+    def __init__(self,
+                 target,
+                 arg,
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.target = target
+        self.arg = arg
+
+    def to_tree(self, *args, level=0, **kwargs):
+        ind = indent(level)
+        target_str = f'target={self.target.to_tree(level=level+2)}, '
+        arg_str = f'arg={repr(self.arg)},'
+
+        out_str = f'{ind}AlterTable(' \
+                  f'{target_str}' \
+                  f'{arg_str}' \
+                  f'\n{ind})'
+        return out_str
+
+    def get_string(self, *args, **kwargs):
+        return f'ALTER TABLE {str(self.target)} {self.arg}'
+
@@ -0,0 +1,52 @@
+import copy
+
+from mindsdb_parser.exceptions import ParsingException
+from mindsdb_parser.utils import to_single_line
+
+
+class ASTNode:
+    def __init__(self, alias=None, parentheses=False):
+        self.alias = alias
+        self.parentheses = parentheses
+
+        if self.alias and len(self.alias.parts) > 1:
+            raise ParsingException('Alias can not contain multiple parts (dots).')
+
+    def maybe_add_alias(self, some_str, alias=True):
+        if self.alias and alias:
+            return f'{some_str} AS {self.alias.to_string(alias=False)}'
+        else:
+            return some_str
+
+    def maybe_add_parentheses(self, some_str):
+        if self.parentheses:
+            return f'({some_str})'
+        else:
+            return some_str
+
+    def to_tree(self, *args, **kwargs):
+        pass
+
+    def get_string(self):
+        pass
+
+    def to_string(self, alias=True):
+        return self.maybe_add_alias(self.maybe_add_parentheses(self.get_string()), alias=alias)
+
+    def copy(self):
+        return copy.deepcopy(self)
+
+    def __str__(self):
+        return self.to_string()
+
+    def __eq__(self, other):
+        if isinstance(other, ASTNode):
+            return self.to_tree() == other.to_tree() and to_single_line(str(self)) == to_single_line(str(other))
+        else:
+            return False
+
+    def __repr__(self):
+        sql = self.to_string().replace('\n', ' ')
+        if len(sql) > 500:
+            sql = sql[:500] + '...'
+        return f'{self.__class__.__name__}:<{sql}>'
@@ -0,0 +1,16 @@
+from mindsdb_parser.ast.base import ASTNode
+from mindsdb_parser.utils import indent
+
+
+class CommitTransaction(ASTNode):
+    def __init__(self,
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def to_tree(self, *args, level=0, **kwargs):
+        ind = indent(level)
+        out_str = f'{ind}CommitTransaction()'
+        return out_str
+
+    def get_string(self, *args, **kwargs):
+        return f'commit'