Fix(rust-tokenizer)!: return token vector in tokenize even on failure #5155
```diff
@@ -1507,10 +1507,14 @@ def tokenize_rs(self, sql: str) -> t.List[Token]:
         if not self._RS_TOKENIZER:
             raise SqlglotError("Rust tokenizer is not available")

-        try:
-            tokens = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
-            for token in tokens:
-                token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
-            return tokens
-        except Exception as e:
-            raise TokenError(str(e))
+        tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
+
+        for token in tokens:
+            token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
+
+        # Setting this here so partial token lists can be inspected even if there is a failure
+        self.tokens = tokens
+
+        if error_msg is not None:
+            raise TokenError(error_msg)
+
+        return tokens
```
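To make the behavioral change concrete, here is a minimal usage sketch. It assumes sqlglotrs is installed so the Rust path is exercised, and the unterminated string literal is just one illustrative input that fails to tokenize:

```python
from sqlglot.errors import TokenError
from sqlglot.tokens import Tokenizer

tokenizer = Tokenizer()

try:
    # An unterminated string literal should abort tokenization partway through
    tokenizer.tokenize("SELECT a, b FROM t WHERE c = 'oops")
except TokenError as e:
    print(e)
    # With this change, the tokens produced before the failure remain
    # inspectable on the tokenizer, even when the Rust path is used:
    for token in tokenizer.tokens:
        print(token.token_type, token.text)
```

Carrying the error across the FFI boundary as data, instead of raising a `PyException` inside Rust, is what lets the Python wrapper stash the partial token list on `self.tokens` before re-raising.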
```diff
@@ -1,7 +1,6 @@
 use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
 use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
-use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};

@@ -45,17 +44,22 @@ impl Tokenizer {
         &self,
         sql: &str,
         dialect_settings: &TokenizerDialectSettings,
-    ) -> Result<Vec<Token>, PyErr> {
+    ) -> (Vec<Token>, Option<String>) {
         let mut state = TokenizerState::new(
             sql,
             &self.settings,
             &self.token_types,
             dialect_settings,
             &self.keyword_trie,
         );
-        state.tokenize().map_err(|e| {
-            PyException::new_err(format!("Error tokenizing '{}': {}", e.context, e.message))
-        })
+        let tokenize_result = state.tokenize();
+        match tokenize_result {
+            Ok(tokens) => (tokens, None),
+            Err(e) => {
+                let msg = format!("Error tokenizing '{}': {}", e.context, e.message);
+                (state.tokens, Some(msg))
+            }
+        }
     }
 }
```
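Because pyo3 converts a Rust `(Vec<Token>, Option<String>)` return value into an ordinary Python tuple, the binding no longer raises at all; the Python wrapper decides what to do with the error. A hedged sketch of calling the raw binding directly, going through the private attributes `_RS_TOKENIZER` and `_rs_dialect_settings` shown in the Python diff above, so treat it as illustrative only:

```python
from sqlglot.tokens import Tokenizer

tokenizer = Tokenizer()

if Tokenizer._RS_TOKENIZER is not None:  # only set when sqlglotrs is installed
    # New contract: a (tokens, error_msg) tuple rather than a raised exception
    tokens, error_msg = Tokenizer._RS_TOKENIZER.tokenize(
        "SELECT 'unterminated", tokenizer._rs_dialect_settings
    )
    if error_msg is not None:
        print(f"failed after {len(tokens)} tokens: {error_msg}")
```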
[nitpick] Consider adding a comment explaining why the virtual environment is reset, so future maintainers can follow the workflow steps for reproducible benchmarks.