Commit 11a95bc

Fix(rust-tokenizer)!: return token vector in tokenize even on failure

1 parent 1a8e78b commit 11a95bc

File tree: 4 files changed, +43 -15 lines

  .github/workflows/benchmark-sqlglot.yml
  sqlglot/tokens.py
  sqlglotrs/src/tokenizer.rs
  tests/test_tokens.py
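
Before this change, a tokenization failure in the Rust tokenizer raised an exception and discarded everything scanned so far; now the partial token list is stored on the Tokenizer before TokenError is raised. A minimal usage sketch of the new behavior, mirroring the test added in tests/test_tokens.py below:

    from sqlglot.errors import TokenError
    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()
    try:
        tokenizer.tokenize("foo 'bar")  # fails on the unbalanced quote
    except TokenError as e:
        print(e)  # message contains "Error tokenizing 'foo 'ba'"
        # Tokens scanned before the failure remain inspectable:
        print(tokenizer.tokens)  # a single VAR token with text "foo"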

.github/workflows/benchmark-sqlglot.yml
Lines changed: 8 additions & 3 deletions

@@ -17,11 +17,9 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: 3.13
-      - name: Create a virtual environment
-        run: |
-          python -m venv .venv
       - name: Install dependencies
         run: |
+          python -m venv .venv
           source ./.venv/bin/activate
           python -m pip install --upgrade pip
           pip install pyperf
@@ -36,6 +34,13 @@ jobs:
         run: |
           git fetch origin main
           git worktree add main-branch origin/main
+          rm -rf .venv
+          python -m venv .venv
+          source ./.venv/bin/activate
+          python -m pip install --upgrade pip
+          pip install pyperf
+          make install-dev
+          make install-dev-rs-release
       - name: Run benchmark on main branch
         run: |
           source ./.venv/bin/activate

sqlglot/tokens.py
Lines changed: 11 additions & 7 deletions

@@ -1507,10 +1507,14 @@ def tokenize_rs(self, sql: str) -> t.List[Token]:
         if not self._RS_TOKENIZER:
             raise SqlglotError("Rust tokenizer is not available")

-        try:
-            tokens = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
-            for token in tokens:
-                token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
-            return tokens
-        except Exception as e:
-            raise TokenError(str(e))
+        tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
+        for token in tokens:
+            token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
+
+        # Setting this here so partial token lists can be inspected even if there is a failure
+        self.tokens = tokens
+
+        if error_msg is not None:
+            raise TokenError(error_msg)
+
+        return tokens

sqlglotrs/src/tokenizer.rs
Lines changed: 9 additions & 5 deletions

@@ -1,7 +1,6 @@
 use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
 use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
-use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};

@@ -45,17 +44,22 @@ impl Tokenizer {
         &self,
         sql: &str,
         dialect_settings: &TokenizerDialectSettings,
-    ) -> Result<Vec<Token>, PyErr> {
+    ) -> (Vec<Token>, Option<String>) {
         let mut state = TokenizerState::new(
             sql,
             &self.settings,
             &self.token_types,
             dialect_settings,
             &self.keyword_trie,
         );
-        state.tokenize().map_err(|e| {
-            PyException::new_err(format!("Error tokenizing '{}': {}", e.context, e.message))
-        })
+        let tokenize_result = state.tokenize();
+        match tokenize_result {
+            Ok(tokens) => (tokens, None),
+            Err(e) => {
+                let msg = format!("Error tokenizing '{}': {}", e.context, e.message);
+                (state.tokens, Some(msg))
+            }
+        }
     }
 }
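
Design note: the old signature, Result<Vec<Token>, PyErr>, could surface either the error or the tokens, never both; returning (Vec<Token>, Option<String>) lets the partially built state.tokens cross the FFI boundary alongside the error message. Seen from the Python side of the binding (variable names here are illustrative), the contract is:

    # tokens: everything scanned so far (the full list on success)
    # error_msg: None on success, otherwise "Error tokenizing '<context>': <message>"
    tokens, error_msg = rs_tokenizer.tokenize(sql, dialect_settings)
    if error_msg is not None:
        ...  # partial tokens are still usable here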

tests/test_tokens.py
Lines changed: 15 additions & 0 deletions

@@ -186,3 +186,18 @@ def test_jinja(self):
                 (TokenType.STRING, ") }}"),
             ],
         )
+
+    def test_partial_token_list(self):
+        tokenizer = Tokenizer()
+
+        try:
+            # This is expected to fail due to the unbalanced string quotes
+            tokenizer.tokenize("foo 'bar")
+        except TokenError as e:
+            self.assertIn("Error tokenizing 'foo 'ba'", str(e))
+
+        partial_tokens = tokenizer.tokens
+
+        self.assertEqual(len(partial_tokens), 1)
+        self.assertEqual(partial_tokens[0].token_type, TokenType.VAR)
+        self.assertEqual(partial_tokens[0].text, "foo")
