Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions .github/workflows/benchmark-sqlglot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,33 @@ jobs:
- name: Create a virtual environment
run: |
python -m venv .venv
- name: Install dependencies
run: |
source ./.venv/bin/activate
python -m pip install --upgrade pip
pip install pyperf
make install-dev
make install-dev-rs-release
- name: Run benchmark on PR branch
run: |
source ./.venv/bin/activate
make install-dev
make install-dev-rs-release
python benchmarks/parse.py --quiet --output bench_parse_pr.json
python benchmarks/optimize.py --quiet --fast --output bench_optimize_pr.json
- name: Checkout main branch into subdir
run: |
git fetch origin main
git worktree add main-branch origin/main
Copy link

Copilot AI Jun 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Consider adding a comment explaining the rationale for resetting the virtual environment to aid future maintainers in understanding the workflow steps for reproducible benchmarks.

Suggested change
git worktree add main-branch origin/main
git worktree add main-branch origin/main
# Reset the virtual environment to ensure a clean and consistent environment
# when switching between branches. This is critical for reproducible benchmarks.

Copilot uses AI. Check for mistakes.
- name: Reset virtual environment
run: |
rm -rf .venv
python -m venv .venv
source ./.venv/bin/activate
python -m pip install --upgrade pip
pip install pyperf
- name: Run benchmark on main branch
run: |
source ./.venv/bin/activate
cd main-branch
make install-dev
make install-dev-rs-release
python benchmarks/parse.py --quiet --output ../bench_parse_main.json
python benchmarks/optimize.py --quiet --fast --output ../bench_optimize_main.json
cd ..
Expand Down
18 changes: 11 additions & 7 deletions sqlglot/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -1507,10 +1507,14 @@ def tokenize_rs(self, sql: str) -> t.List[Token]:
if not self._RS_TOKENIZER:
raise SqlglotError("Rust tokenizer is not available")

try:
tokens = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
for token in tokens:
token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]
return tokens
except Exception as e:
raise TokenError(str(e))
tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
Copy link

Copilot AI Jun 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update the RS tokenizer integration documentation or comments to note the new tuple return type, ensuring that downstream users understand that partial tokens are available even after a failure.

Copilot uses AI. Check for mistakes.
for token in tokens:
token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]

# Setting this here so partial token lists can be inspected even if there is a failure
self.tokens = tokens

if error_msg is not None:
raise TokenError(error_msg)

return tokens
14 changes: 9 additions & 5 deletions sqlglotrs/src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::settings::TokenType;
use crate::trie::{Trie, TrieResult};
use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use pyo3::exceptions::PyException;
use pyo3::prelude::*;
use std::cmp::{max, min};

Expand Down Expand Up @@ -45,17 +44,22 @@ impl Tokenizer {
&self,
sql: &str,
dialect_settings: &TokenizerDialectSettings,
) -> Result<Vec<Token>, PyErr> {
) -> (Vec<Token>, Option<String>) {
let mut state = TokenizerState::new(
sql,
&self.settings,
&self.token_types,
dialect_settings,
&self.keyword_trie,
);
state.tokenize().map_err(|e| {
PyException::new_err(format!("Error tokenizing '{}': {}", e.context, e.message))
})
let tokenize_result = state.tokenize();
match tokenize_result {
Copy link

Copilot AI Jun 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider adding a comment that explains the new behavior of returning a tuple (partial tokens and an optional error message) so that API consumers are aware of how to handle tokenization failures.

Copilot uses AI. Check for mistakes.
Ok(tokens) => (tokens, None),
Err(e) => {
let msg = format!("Error tokenizing '{}': {}", e.context, e.message);
(state.tokens, Some(msg))
}
}
}
}

Expand Down
15 changes: 15 additions & 0 deletions tests/test_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,18 @@ def test_jinja(self):
(TokenType.STRING, ") }}"),
],
)

def test_partial_token_list(self):
    """Tokenizing invalid SQL should raise TokenError while still exposing
    the tokens produced before the failure via `tokenizer.tokens`."""
    tokenizer = Tokenizer()

    # assertRaises (vs. a bare try/except) guarantees the test FAILS if no
    # exception is raised; the original try/except would silently pass when
    # tokenize() succeeded, masking a regression.
    with self.assertRaises(TokenError) as ctx:
        # Expected to fail due to the unbalanced string quote
        tokenizer.tokenize("foo 'bar")

    self.assertIn("Error tokenizing 'foo 'ba'", str(ctx.exception))

    # Even after the failure, the tokens scanned before the error must be
    # available for inspection.
    partial_tokens = tokenizer.tokens

    self.assertEqual(len(partial_tokens), 1)
    self.assertEqual(partial_tokens[0].token_type, TokenType.VAR)
    self.assertEqual(partial_tokens[0].text, "foo")