Commit f8b0829

Fix(rust-tokenizer)!: return token vector in tokenize even on failure
1 parent 1a8e78b commit f8b0829

File tree

4 files changed (+46, -16 lines)

.github/workflows/benchmark-sqlglot.yml

Lines changed: 11 additions & 4 deletions
@@ -20,26 +20,33 @@ jobs:
       - name: Create a virtual environment
         run: |
           python -m venv .venv
-      - name: Install dependencies
-        run: |
           source ./.venv/bin/activate
           python -m pip install --upgrade pip
           pip install pyperf
-          make install-dev
-          make install-dev-rs-release
       - name: Run benchmark on PR branch
         run: |
           source ./.venv/bin/activate
+          make install-dev
+          make install-dev-rs-release
           python benchmarks/parse.py --quiet --output bench_parse_pr.json
           python benchmarks/optimize.py --quiet --fast --output bench_optimize_pr.json
       - name: Checkout main branch into subdir
         run: |
           git fetch origin main
           git worktree add main-branch origin/main
+      - name: Reset virtual environment
+        run: |
+          rm -rf .venv
+          python -m venv .venv
+          source ./.venv/bin/activate
+          python -m pip install --upgrade pip
+          pip install pyperf
       - name: Run benchmark on main branch
         run: |
           source ./.venv/bin/activate
           cd main-branch
+          make install-dev
+          make install-dev-rs-release
           python benchmarks/parse.py --quiet --output ../bench_parse_main.json
           python benchmarks/optimize.py --quiet --fast --output ../bench_optimize_main.json
           cd ..
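
Note on the workflow change: `make install-dev` and `make install-dev-rs-release` now run inside each branch's benchmark step, and the virtual environment is recreated before benchmarking main, presumably so that each run measures that branch's own sqlglot and sqlglotrs builds rather than reusing the packages installed for the PR branch.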

sqlglot/tokens.py

Lines changed: 11 additions & 7 deletions
@@ -1507,10 +1507,14 @@ def tokenize_rs(self, sql: str) -> t.List[Token]:
         if not self._RS_TOKENIZER:
             raise SqlglotError("Rust tokenizer is not available")
 
-        try:
-            tokens = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
-            for token in tokens:
-                token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
-            return tokens
-        except Exception as e:
-            raise TokenError(str(e))
+        tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
+        for token in tokens:
+            token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
+
+        # Setting this here so partial token lists can be inspected even if there is a failure
+        self.tokens = tokens
+
+        if error_msg is not None:
+            raise TokenError(error_msg)
+
+        return tokens
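
For reference, a minimal sketch of how callers can take advantage of the new behavior, assuming the Rust tokenizer (sqlglotrs) is installed and enabled so that `tokenize_rs` is the active code path; the test added in this commit exercises the same flow:

from sqlglot.errors import TokenError
from sqlglot.tokens import Tokenizer

tokenizer = Tokenizer()

try:
    # Unbalanced quotes make tokenization fail partway through.
    tokenizer.tokenize("foo 'bar")
except TokenError as exc:
    print(exc)
    # The tokens produced before the failure remain available on the
    # instance, so they can be inspected for diagnostics or recovery.
    for token in tokenizer.tokens:
        print(token.token_type, token.text)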

sqlglotrs/src/tokenizer.rs

Lines changed: 9 additions & 5 deletions
@@ -1,7 +1,6 @@
 use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
 use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
-use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};
 
@@ -45,17 +44,22 @@ impl Tokenizer {
         &self,
         sql: &str,
         dialect_settings: &TokenizerDialectSettings,
-    ) -> Result<Vec<Token>, PyErr> {
+    ) -> (Vec<Token>, Option<String>) {
         let mut state = TokenizerState::new(
             sql,
             &self.settings,
             &self.token_types,
             dialect_settings,
             &self.keyword_trie,
         );
-        state.tokenize().map_err(|e| {
-            PyException::new_err(format!("Error tokenizing '{}': {}", e.context, e.message))
-        })
+        let tokenize_result = state.tokenize();
+        match tokenize_result {
+            Ok(tokens) => (tokens, None),
+            Err(e) => {
+                let msg = format!("Error tokenizing '{}': {}", e.context, e.message);
+                (state.tokens, Some(msg))
+            }
+        }
     }
 }
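
Design note: the Rust `tokenize` now returns the `(Vec<Token>, Option<String>)` pair directly instead of `Result<Vec<Token>, PyErr>`, so the tokens already accumulated in `state.tokens` survive a failure and are handed back across the FFI boundary rather than being dropped with an exception. Raising `TokenError` from the error message is left to the Python wrapper in `sqlglot/tokens.py`, which is also why the `PyException` import is no longer needed here.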

tests/test_tokens.py

Lines changed: 15 additions & 0 deletions
@@ -186,3 +186,18 @@ def test_jinja(self):
                 (TokenType.STRING, ") }}"),
             ],
         )
+
+    def test_partial_token_list(self):
+        tokenizer = Tokenizer()
+
+        try:
+            # This is expected to fail due to the unbalanced string quotes
+            tokenizer.tokenize("foo 'bar")
+        except TokenError as e:
+            self.assertIn("Error tokenizing 'foo 'ba'", str(e))
+
+        partial_tokens = tokenizer.tokens
+
+        self.assertEqual(len(partial_tokens), 1)
+        self.assertEqual(partial_tokens[0].token_type, TokenType.VAR)
+        self.assertEqual(partial_tokens[0].text, "foo")
