mindsdb · ea-rus · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -0,0 +1,33 @@
+name: Build and publish to pypi
+
+on:
+  workflow_run:
+    workflows: ["Release"]
+    types:
+      - completed
+
+jobs:
+  # Push a new release to PyPI
+  deploy_to_pypi:
+    name: Publish to PyPI
+    runs-on: ubuntu-latest
+    # `workflow_run` with type `completed` also fires for failed or cancelled
+    # runs, so publish only when the triggering "Release" run succeeded.
+    if: github.event.workflow_run.conclusion == 'success' && github.actor != 'mindsdbadmin'
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ vars.CI_PYTHON_VERSION }}
+      - name: Install dependencies
+        run: |
+          pip install setuptools wheel twine
+      - name: Clean previous builds
+        run: rm -rf dist/ build/ *.egg-info
+      - name: Build and publish
+        env:
+          # PyPI API-token authentication: the user name is the literal "__token__"
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          # This uses the version string from __about__.py.
+          # NOTE: this workflow does not itself verify that the version matches the git tag.
+          python setup.py sdist
+          twine upload dist/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,59 @@
+name: run unit tests
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        # Quoted so YAML keeps the versions as strings (a bare 3.10 would be read as 3.1)
+        python-version: ['3.8', '3.9', '3.10']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements_test.txt
+          pip install pytest-cov
+          pip install --no-cache-dir -e .[test]
+      - name: Run unit tests
+        run: pytest -v
+        shell: bash
+
+
+  # Runs only after the test matrix has passed; posts a coverage report on the pull request
+  coverage:
+    needs: test
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 pytest pytest-cov
+          pip install -r requirements_test.txt
+
+      - name: Build coverage file
+        run: |
+          # Without pipefail the exit status of `tee` would hide a pytest failure
+          set -o pipefail
+          pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=mindsdb_sql_parser tests/ | tee pytest-coverage.txt
+
+      - name: Pytest coverage comment
+        uses: MishaKav/pytest-coverage-comment@main
+        with:
+          pytest-coverage-path: ./pytest-coverage.txt
+          junitxml-path: ./pytest.xml
+
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.venv
+.idea
+venv/
+__pycache__/
diff --git a/README.md b/README.md
@@ -1 +1,78 @@
 # MindsDB SQL Parser 🚧
+
+
+# Installation
+
+```
+  pip install mindsdb_sql_parser
+```
+
+## How to use
+
+```python
+
+from mindsdb_sql_parser import parse_sql
+
+query = parse_sql('select b from aaa where c=1')
+
+# result is abstract syntax tree (AST) 
+query
+
+# string representation of AST
+query.to_tree()
+
+# representation of the tree as an SQL string; it may not exactly match the original SQL
+query.to_string()
+
+```
+
+## Architecture
+
+Parsing is implemented with the [SLY](https://sly.readthedocs.io/en/latest/sly.html) library.
+
+Parsing consists of 2 stages (with a separate module for every dialect):
+- Defining keywords in the lexer.py module. This is done mostly with regular expressions.
+- Defining syntax rules in the parser.py module. This is done by describing the rules in [BNF grammar](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form)
+  - The syntax is defined in the decorator of a function. Inside the decorator you can use a keyword itself or another function of the parser
+  - The output of a function can be used as input in other functions of the parser
+  - The outputs of the parser are listed in "Top-level statements". Each of them has to be an abstract syntax tree (AST) object.
+
+SLY does not support inheritance, therefore every dialect is described completely, without one dialect extending another.
+
+### [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
+- Structure of AST is defined in separate modules (in parser/ast/).
+- It can be inherited
+- Every class has to have these methods:
+  - to_tree - to return hierarchical representation of object
+  - get_string - to return object as sql expression (or sub-expression)
+  - copy - to copy AST-tree to new object
+
+### Error handling
+
+For a better user experience, a parsing error contains useful information about the location of the problem and a possible way to fix it.
+1. It shows the location of the error if
+  - a character can't be parsed (by the lexer)
+  - a token is unexpected (by the parser)
+2. It tries to propose a correct token instead of (or before) the error location. Possible options:
+  - A keyword is shown as is.
+  - '[number]' - if a float or an integer is expected
+  - '[string]' - if a string is expected
+  - '[identifier]' - if the name of an object is expected. For example, the names are the bold words here:
+    - "select **x** as **name** from **tbl1** where **col**=1"
+
+How the suggestion works:
+It uses the next possible tokens defined by the syntax rules.
+If this is the end of the query, it just shows these tokens.
+Otherwise:
+- it tries to replace the bad token with another token from the list of possible tokens
+- it tries to parse the query once again; if there is no error:
+  - it adds this token to the suggestion list
+- second iteration: it puts the possible token before the bad token (instead of replacing it) and repeats the same operation.
+
+
+# How to test
+
+```bash
+pip install -r requirements_test.txt
+env PYTHONPATH=./ pytest
+```
diff --git a/mindsdb_sql_parser/__about__.py b/mindsdb_sql_parser/__about__.py
@@ -0,0 +1,10 @@
+# Package metadata for mindsdb_sql_parser: name, version, description, contact and project links.
+# NOTE(review): the __email__ value below looks like an obfuscated placeholder
+# rather than a real address — confirm the intended value.
+__title__ = 'mindsdb_sql_parser'
+__package_name__ = 'mindsdb_sql_parser'
+__version__ = '0.0.1'
+__description__ = "Mindsdb SQL parser"
+__email__ = "[email protected]"
+__author__ = 'MindsDB Inc'
+__github__ = 'https://github.com/mindsdb/mindsdb_sql_parser'
+__pypi__ = 'https://pypi.org/project/mindsdb_sql_parser'
+__license__ = 'MIT'
+__copyright__ = 'Copyright 2024- mindsdb'
diff --git a/mindsdb_sql_parser/__init__.py b/mindsdb_sql_parser/__init__.py
@@ -0,0 +1,182 @@
+import re
+from collections import defaultdict
+
+from sly.lex import Token
+
+from mindsdb_sql_parser.exceptions import ParsingException
+from mindsdb_sql_parser.ast import *
+
+
+class ErrorHandling:
+
+    def __init__(self, lexer, parser):
+        self.parser = parser
+        self.lexer = lexer
+
+    def process(self, error_info):
+        self.tokens = [t for t in error_info['tokens'] if t is not None]
+        self.bad_token = error_info['bad_token']
+        self.expected_tokens = error_info['expected_tokens']
+
+        if len(self.tokens) == 0:
+            return 'Empty input'
+
+        # show error location
+        msgs = self.error_location()
+
+        # suggestion
+        suggestions = self.make_suggestion()
+
+        if suggestions:
+            prefix = 'Possible inputs: ' if len(suggestions) > 1 else 'Expected symbol: '
+            msgs.append(prefix + ', '.join([f'"{item}"' for item in suggestions]))
+        return '\n'.join(msgs)
+
+    def error_location(self):
+
+        # restore query text
+        lines_idx = defaultdict(str)
+
+        # used + unused tokens
+        for token in self.tokens:
+            if token is None:
+                continue
+            line = lines_idx[token.lineno]
+
+            if len(line) > token.index:
+                line = line[: token.index]
+            else:
+                line = line.ljust(token.index)
+
+            line += token.value
+            lines_idx[token.lineno] = line
+
+        msgs = []
+
+        # error message and location
+        if self.bad_token is None:
+            msgs.append('Syntax error, unexpected end of query:')
+            error_len = 1
+            # last line
+            error_line_num = list(lines_idx.keys())[-1]
+            error_index = len(lines_idx[error_line_num])
+        else:
+            msgs.append('Syntax error, unknown input:')
+            error_len = len(self.bad_token.value)
+            error_line_num = self.bad_token.lineno
+            error_index = self.bad_token.index
+
+        # shift lines indexes (it removes spaces from beginnings of the lines)
+        lines = []
+        shift = 0
+        error_line = 0
+        for i, line_num in enumerate(lines_idx.keys()):
+            if line_num == error_line_num:
+                error_index -= shift
+                error_line = i
+
+            line = lines_idx[line_num]
+            lines.append(line[shift:])
+            shift = len(line)
+
+        # add source code
+        first_line = error_line - 2 if error_line > 1 else 0
+        for line in lines[first_line: error_line + 1]:
+            msgs.append('>' + line)
+
+        # error position
+        msgs.append('-' * (error_index + 1) + '^' * error_len)
+        return msgs
+
+    def make_suggestion(self):
+        if len(self.expected_tokens) == 0:
+            return []
+
+        # find error index
+        error_index = None
+        for i, token in enumerate(self.tokens):
+            if token is self.bad_token :
+                error_index = i
+
+        expected = {}  # value: token
+
+        for token_name in self.expected_tokens:
+            value = getattr(self.lexer, token_name, None)
+            if token_name == 'ID':
+                # a lot of other tokens could be ID
+                expected = {'[identifier]': token_name}
+                break
+            elif token_name in ('FLOAT', 'INTEGER'):
+                expected['[number]'] = token_name
+
+            elif token_name in ('DQUOTE_STRING', 'QUOTE_STRING'):
+                expected['[string]'] = token_name
+
+            elif isinstance(value, str):
+                value = value.replace('\\b', '').replace('\\', '')
+
+                # doesn't content regexp
+                if '\\s' not in value and '|' not in value:
+                    expected[value] = token_name
+
+        suggestions = []
+        if len(expected) == 1:
+            # use only it
+            first_value = list(expected.keys())[0]
+            suggestions.append(first_value)
+
+        elif 1 < len(expected) < 20:
+            if self.bad_token is None:
+                # if this is the end of query, just show next expected keywords
+                return list(expected.keys())
+
+            # not every suggestion satisfy the end of the query. we have to check if it works
+            for value, token_name in expected.items():
+                # make up a token
+                token = Token()
+                token.type = token_name
+                token.value = value
+                token.end = 0
+                token.index = 0
+                token.lineno = 0
+
+                # try to add token
+                tokens2 = self.tokens[:error_index] + [token] + self.tokens[error_index:]
+                if self.query_is_valid(tokens2):
+                    suggestions.append(value)
+                    continue
+
+                # try to replace token
+                tokens2 = self.tokens[:error_index - 1] + [token] + self.tokens[error_index:]
+                if self.query_is_valid(tokens2):
+                    suggestions.append(value)
+                    continue
+
+        return suggestions
+
+    def query_is_valid(self, tokens):
+        # try to parse list of tokens
+
+        ast = self.parser.parse(iter(tokens))
+        return ast is not None
+
+
+def parse_sql(sql):
+    from mindsdb_sql_parser.lexer import MindsDBLexer
+    from mindsdb_sql_parser.parser import MindsDBParser
+    lexer, parser = MindsDBLexer(), MindsDBParser()
+
+    # remove ending semicolon and spaces
+    sql = re.sub(r'[\s;]+$', '', sql)
+
+    tokens = lexer.tokenize(sql)
+    ast = parser.parse(tokens)
+
+    if ast is None:
+
+        eh = ErrorHandling(lexer, parser)
+        message = eh.process(parser.error_info)
+
+        raise ParsingException(message)
+
+    return ast
diff --git a/mindsdb_sql_parser/ast/__init__.py b/mindsdb_sql_parser/ast/__init__.py
@@ -0,0 +1,19 @@
+from .base import ASTNode
+from .select import *
+from .show import *
+from .use import *
+from .describe import *
+from .set import *
+from .start_transaction import *
+from .rollback_transaction import *
+from .commit_transaction import *
+from .explain import *
+from .alter_table import *
+from .insert import *
+from .update import *
+from .delete import *
+from .drop import *
+from .create import *
+from .variable import *
+
+from .mindsdb.latest import Latest