Skip to content

Commit e7621ea

Browse files
[Fix] Fix Multi-byte unicode characters in StructuralTagItem. (#396)
As reported in #392, multi-byte characters in the StructuralTagItem `end` / `begin` fields cause a crash. The crash is caused by incorrect processing of multi-byte characters when building the grammar from a StructuralTagItem, which in turn crashes the grammar compiler. This PR fixes the problem. Signed-off-by: Yuchuan <[email protected]> --------- Signed-off-by: Yuchuan <[email protected]>
1 parent f1b913e commit e7621ea

File tree

3 files changed

+18
-7
lines changed

3 files changed

+18
-7
lines changed

cpp/grammar_builder.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,10 @@ class GrammarBuilder {
9595
* \param str The string to be added.
9696
*/
9797
int32_t AddByteString(const std::string& str) {
98-
std::vector<int32_t> bytes(str.begin(), str.end());
98+
std::vector<int32_t> bytes;
99+
for (char c : str) {
100+
bytes.push_back(static_cast<int32_t>(static_cast<uint8_t>(c)));
101+
}
99102
return AddGrammarExpr(
100103
{GrammarExprType::kByteString, bytes.data(), static_cast<int32_t>(bytes.size())}
101104
);

cpp/grammar_parser.cc

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -625,12 +625,7 @@ int32_t EBNFParser::ParseString() {
625625
return builder_.AddEmptyStr();
626626
}
627627

628-
// Convert string to bytes
629-
std::vector<int32_t> bytes;
630-
for (auto c : str_value) {
631-
bytes.push_back(static_cast<int32_t>(static_cast<uint8_t>(c)));
632-
}
633-
return builder_.AddByteString(bytes);
628+
return builder_.AddByteString(str_value);
634629
}
635630

636631
int32_t EBNFParser::ParseRuleRef() {

tests/python/test_grammar_matcher_structural_tag.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,5 +248,18 @@ def test_empty_tag_dispatch():
248248
assert not _is_grammar_accept_string(grammar_with_stop_str, "aaa")
249249

250250

251+
@pytest.mark.hf_token_required
252+
def test_utf8_structural_tag_begin_end():
253+
model = "deepseek-ai/DeepSeek-V3-0324"
254+
tokenizer = AutoTokenizer.from_pretrained(model)
255+
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
256+
compiler = xgr.GrammarCompiler(tokenizer_info)
257+
structures = [
258+
xgr.StructuralTagItem(begin="<|tool▁calls▁begin|>", schema={}, end="<|tool▁calls▁end|>")
259+
]
260+
triggers = ["<|tool▁calls▁begin|>"]
261+
_ = compiler.compile_structural_tag(structures, triggers)
262+
263+
251264
if __name__ == "__main__":
252265
pytest.main(sys.argv)

0 commit comments

Comments
 (0)