diff --git a/.github/workflows/benchmark-sqlglot.yml b/.github/workflows/benchmark-sqlglot.yml
index 6bdea3e9a9..19e396c245 100644
--- a/.github/workflows/benchmark-sqlglot.yml
+++ b/.github/workflows/benchmark-sqlglot.yml
@@ -20,26 +20,33 @@ jobs:
       - name: Create a virtual environment
         run: |
           python -m venv .venv
-      - name: Install dependencies
-        run: |
           source ./.venv/bin/activate
           python -m pip install --upgrade pip
           pip install pyperf
-          make install-dev
-          make install-dev-rs-release
       - name: Run benchmark on PR branch
         run: |
           source ./.venv/bin/activate
+          make install-dev
+          make install-dev-rs-release
           python benchmarks/parse.py --quiet --output bench_parse_pr.json
           python benchmarks/optimize.py --quiet --fast --output bench_optimize_pr.json
       - name: Checkout main branch into subdir
         run: |
           git fetch origin main
           git worktree add main-branch origin/main
+      - name: Reset virtual environment
+        run: |
+          rm -rf .venv
+          python -m venv .venv
+          source ./.venv/bin/activate
+          python -m pip install --upgrade pip
+          pip install pyperf
       - name: Run benchmark on main branch
         run: |
           source ./.venv/bin/activate
           cd main-branch
+          make install-dev
+          make install-dev-rs-release
           python benchmarks/parse.py --quiet --output ../bench_parse_main.json
           python benchmarks/optimize.py --quiet --fast --output ../bench_optimize_main.json
           cd ..
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index 5af5d27d54..0a5667551f 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -1507,10 +1507,14 @@ def tokenize_rs(self, sql: str) -> t.List[Token]:
         if not self._RS_TOKENIZER:
             raise SqlglotError("Rust tokenizer is not available")
 
-        try:
-            tokens = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
-            for token in tokens:
-                token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
-            return tokens
-        except Exception as e:
-            raise TokenError(str(e))
+        tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
+        for token in tokens:
+            token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
+
+        # Setting this here so partial token lists can be inspected even if there is a failure
+        self.tokens = tokens
+
+        if error_msg is not None:
+            raise TokenError(error_msg)
+
+        return tokens
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 9849299a10..6ca02a1421 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -1,7 +1,6 @@
 use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
 use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
-use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};
 
@@ -45,7 +44,7 @@ impl Tokenizer {
         &self,
         sql: &str,
         dialect_settings: &TokenizerDialectSettings,
-    ) -> Result<Vec<Token>, PyErr> {
+    ) -> (Vec<Token>, Option<String>) {
         let mut state = TokenizerState::new(
             sql,
             &self.settings,
@@ -53,9 +52,14 @@ impl Tokenizer {
             dialect_settings,
             &self.keyword_trie,
         );
-        state.tokenize().map_err(|e| {
-            PyException::new_err(format!("Error tokenizing '{}': {}", e.context, e.message))
-        })
+        let tokenize_result = state.tokenize();
+        match tokenize_result {
+            Ok(tokens) => (tokens, None),
+            Err(e) => {
+                let msg = format!("Error tokenizing '{}': {}", e.context, e.message);
+                (state.tokens, Some(msg))
+            }
+        }
     }
 }
 
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
index d9242eff28..c2f1255e17 100644
--- a/tests/test_tokens.py
+++ b/tests/test_tokens.py
@@ -186,3 +186,18 @@ def test_jinja(self):
                 (TokenType.STRING, ") }}"),
             ],
         )
+
+    def test_partial_token_list(self):
+        tokenizer = Tokenizer()
+
+        try:
+            # This is expected to fail due to the unbalanced string quotes
+            tokenizer.tokenize("foo 'bar")
+        except TokenError as e:
+            self.assertIn("Error tokenizing 'foo 'ba'", str(e))
+
+        partial_tokens = tokenizer.tokens
+
+        self.assertEqual(len(partial_tokens), 1)
+        self.assertEqual(partial_tokens[0].token_type, TokenType.VAR)
+        self.assertEqual(partial_tokens[0].text, "foo")
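Taken together, the tokens.py and tokenizer.rs hunks change the Rust tokenizer's failure mode: the binding now returns whatever tokens it produced plus an optional error message, and the Python wrapper stores them on self.tokens before raising TokenError. Below is a minimal caller-side sketch of how that partial token list could be used; parse_prefix is a hypothetical helper invented for illustration, not part of this PR.

    from sqlglot.errors import TokenError
    from sqlglot.tokens import Tokenizer


    def parse_prefix(sql: str):
        # Hypothetical helper: tokenize as much of `sql` as possible.
        # Relies on the behaviour added in this diff: tokenize() stores the
        # partial token list on tokenizer.tokens before raising TokenError.
        tokenizer = Tokenizer()
        try:
            return tokenizer.tokenize(sql), None
        except TokenError as e:
            # The partial token list survives the failure and can be inspected.
            return tokenizer.tokens, str(e)


    tokens, error = parse_prefix("foo 'bar")
    print([t.text for t in tokens])  # ["foo"] -- only the tokens before the unbalanced quote
    print(error)                     # contains "Error tokenizing 'foo 'ba'"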