Skip to content

Commit 07e978f

Browse files
[Fix] Fix the efficiency issue in repetition structure. (#467)
As reported in #465, this PR aims to fix the efficiency issue in the repetition structure. The issue stems from two problems in the repetition construction method: - The threshold is too small. - The construction of the parts other than the repetition expression is suboptimal. This PR fixes both problems. The efficiency can be evaluated with the following code: ``` python import xgrammar as xgr from transformers import AutoTokenizer from xgrammar.testing import _get_matcher_from_grammar_and_tokenizer_info import time schema = { "additionalProperties": False, "properties": { "description": { "maxLength": 200, "minLength": 10, "title": "Some description", "type": "string" } }, "required": [ "description" ], "title": "Some object", "type": "object" } tokenizer_path = "meta-llama/Meta-Llama-3-8B-Instruct" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True, trust_remote_code=True) tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer) grammar = xgr.Grammar.from_json_schema(schema) matcher = _get_matcher_from_grammar_and_tokenizer_info(grammar, tokenizer_info) test_str = "{\"description\": \"1234567890123456789101234567891012345678910123456789101234567891" + \ "012345678910123456789101234567891012345678910123456789101234567891012345678910123456" + \ "78910123456789101234567897891012345678910123456789\"}" token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size) tpot = [] print(grammar) for char in test_str: start_time = time.time_ns() matcher.fill_next_token_bitmask(token_bitmask) assert matcher.accept_string(char) end_time = time.time_ns() print(f"Processed char '{char}' in {(end_time - start_time)/1e3} us") tpot.append((end_time - start_time)/1e3) tpot.sort() print(f"avg: {sum(tpot) / len(tpot)} us") print(f"max: {tpot[-1]} us") print(f"p50: {tpot[int(0.5 * len(tpot))]} us") print(f"p90: {tpot[int(0.9 * len(tpot))]} us") ``` And the results are listed below: ``` v0.1.21 avg: 544.5573364055299 us max: 
6720.648 us p50: 332.969 us p90: 620.647 us main branch avg: 4789.501852534562 us max: 7862.335 us p50: 5813.238 us p90: 6398.427 us This PR avg: 353.5097096774194 us max: 1064.672 us p50: 449.098 us p90: 664.901 us ``` --------- Signed-off-by: Yuchuan <[email protected]>
1 parent 47658a3 commit 07e978f

File tree

4 files changed

+126
-101
lines changed

4 files changed

+126
-101
lines changed

cpp/grammar_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ class Grammar::Impl {
125125
// tag_expr should be a byte string, and rule_id should be a rule id.
126126
// loop_after_dispatch is a bool.
127127
kTagDispatch,
128-
// data format: [grammar_expr_id, min_repeat_count, max_repeat_count]
128+
// data format: [rule_id, min_repeat_count, max_repeat_count]
129129
kRepeat,
130130
};
131131

cpp/grammar_parser.cc

Lines changed: 68 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ class EBNFParser {
494494
int32_t HandlePlusQuantifier(int32_t grammar_expr_id);
495495
int32_t HandleQuestionQuantifier(int32_t grammar_expr_id);
496496
int32_t HandleRepetitionRange(int32_t grammar_expr_id, int64_t lower, int64_t upper);
497+
int32_t LegacyHandleRepetitionRange(int32_t grammar_expr_id, int64_t lower, int64_t upper);
497498

498499
// When parsing, we first find the names of all rules, and build the mapping from name to rule id.
499500
void InitRuleNames();
@@ -757,71 +758,84 @@ int32_t EBNFParser::HandleQuestionQuantifier(int32_t grammar_expr_id) {
757758
return builder_.AddRuleRef(new_rule_id);
758759
}
759760

761+
int32_t EBNFParser::LegacyHandleRepetitionRange(
762+
int32_t grammar_expr_id, int64_t lower, int64_t upper
763+
) {
764+
// Construct expr expr ... expr (l times)
765+
766+
std::vector<int32_t> elements;
767+
for (int64_t i = 0; i < lower; ++i) {
768+
elements.push_back(grammar_expr_id);
769+
}
770+
771+
// Case 1: {l}:
772+
// expr expr ... expr (l times)
773+
if (upper == lower) {
774+
return builder_.AddSequence(elements);
775+
}
776+
777+
// Case 2: {l,}:
778+
// expr expr ... expr (l times) rest
779+
// rest ::= "" | expr rest
780+
if (upper == -1) {
781+
auto new_rule_name = builder_.GetNewRuleName(cur_rule_name_);
782+
auto new_rule_id = builder_.AddEmptyRule(new_rule_name);
783+
auto ref_to_new_rule = builder_.AddRuleRef(new_rule_id);
784+
auto new_grammar_expr_id = builder_.AddChoices(
785+
{builder_.AddEmptyStr(), builder_.AddSequence({grammar_expr_id, ref_to_new_rule})}
786+
);
787+
builder_.UpdateRuleBody(new_rule_id, new_grammar_expr_id);
788+
elements.push_back(builder_.AddRuleRef(new_rule_id));
789+
return builder_.AddSequence(elements);
790+
}
791+
792+
// Case 3: {l, r} (r - l >= 1)
793+
// expr expr ... expr (l times) rest1
794+
// rest1 ::= "" | expr rest2
795+
// rest2 ::= "" | expr rest3
796+
// ...
797+
// rest(r - l) ::= "" | expr
798+
std::vector<int32_t> rest_rule_ids;
799+
800+
for (int64_t i = 0; i < upper - lower; ++i) {
801+
auto new_rule_name = builder_.GetNewRuleName(cur_rule_name_);
802+
rest_rule_ids.push_back(builder_.AddEmptyRule(new_rule_name));
803+
}
804+
for (int64_t i = 0; i < upper - lower - 1; ++i) {
805+
auto ref_to_next_rule = builder_.AddRuleRef(rest_rule_ids[i + 1]);
806+
auto new_grammar_expr_id = builder_.AddChoices(
807+
{builder_.AddEmptyStr(), builder_.AddSequence({grammar_expr_id, ref_to_next_rule})}
808+
);
809+
builder_.UpdateRuleBody(rest_rule_ids[i], new_grammar_expr_id);
810+
}
811+
auto last_grammar_expr_id = builder_.AddChoices({builder_.AddEmptyStr(), grammar_expr_id});
812+
builder_.UpdateRuleBody(rest_rule_ids.back(), last_grammar_expr_id);
813+
814+
elements.push_back(builder_.AddRuleRef(rest_rule_ids[0]));
815+
return builder_.AddSequence(elements);
816+
}
817+
760818
int32_t EBNFParser::HandleRepetitionRange(
761819
const int32_t grammar_expr_id, int64_t lower, int64_t upper
762820
) {
763-
static const int64_t kUnzipThreshold = 15;
821+
static const int64_t kUnzipThreshold = 128;
764822
XGRAMMAR_DCHECK(lower >= 0);
765823
XGRAMMAR_DCHECK(upper == -1 || upper >= lower);
766-
// Case 1. small (<=threshold), unzip the repetition.
767-
if (upper != -1 && upper <= kUnzipThreshold) {
768-
std::vector<int32_t> choices;
769-
if (lower == 0) {
770-
choices.push_back(builder_.AddEmptyStr());
771-
lower = 1; // We have already handled the empty string case.
772-
}
773-
for (int64_t count = lower; count <= upper; ++count) {
774-
std::vector<int32_t> sequence;
775-
for (int64_t i = 0; i < count; ++i) {
776-
sequence.push_back(grammar_expr_id);
777-
}
778-
choices.push_back(builder_.AddSequence(sequence));
779-
}
780-
return builder_.AddChoices(choices);
824+
// Case 1.1 small upper (<=threshold), unzip the repetition.
825+
// Case 1.2 unbounded upper, and lower is also small (<=threshold), unzip the lower part.
826+
if ((upper != -1 && upper <= kUnzipThreshold) || (upper == -1 && lower <= kUnzipThreshold)) {
827+
return LegacyHandleRepetitionRange(grammar_expr_id, lower, upper);
781828
}
782829

783-
// Case 2. upper is unbounded or large.
830+
// Case 2. upper is unbounded, and lower is large (>threshold).
831+
// Or upper is bounded, but upper > threshold.
784832

785833
// Case 2.1.1. lower is smaller than threshold, and upper is large. Transform {lower, upper} into:
786-
// {threshold, upper} | {lower} | ... | {threshold}.
787-
// Case 2.1.2. lower is smaller than threshold, and upper is unbounded. Unzip the
788-
// {lower} repetition, and add a star expression.
834+
// {threshold, upper} | {lower, threshold}
789835
std::vector<int32_t> choices;
790836
if (lower < kUnzipThreshold) {
791-
if (upper == -1) {
792-
int infinite_repetition_id = -1;
793-
const auto& rule_expr = builder_.GetGrammarExpr(grammar_expr_id);
794-
if (rule_expr.type == GrammarBuilder::GrammarExprType::kCharacterClass) {
795-
std::vector<GrammarBuilder::CharacterClassElement> character_ranges;
796-
bool is_negative = rule_expr[0];
797-
for (int i = 1; i < static_cast<int>(rule_expr.size()); i += 2) {
798-
character_ranges.push_back({rule_expr[i], rule_expr[i + 1]});
799-
}
800-
infinite_repetition_id = builder_.AddCharacterClassStar(character_ranges, is_negative);
801-
} else {
802-
const auto& unbounded_rule_id =
803-
builder_.AddEmptyRule(builder_.GetNewRuleName(cur_rule_name_ + "_repeat_inf"));
804-
int recursion_sequence =
805-
builder_.AddSequence({grammar_expr_id, builder_.AddRuleRef(unbounded_rule_id)});
806-
int recursion_choice = builder_.AddChoices({builder_.AddEmptyStr(), recursion_sequence});
807-
builder_.UpdateRuleBody(unbounded_rule_id, recursion_choice);
808-
infinite_repetition_id = builder_.AddRuleRef(unbounded_rule_id);
809-
}
810-
std::vector<int32_t> sequence(lower, grammar_expr_id);
811-
sequence.push_back(infinite_repetition_id);
812-
return builder_.AddSequence(sequence);
813-
}
814-
if (lower == 0) {
815-
choices.push_back(builder_.AddEmptyStr());
816-
lower = 1;
817-
}
818-
for (; lower < kUnzipThreshold; ++lower) {
819-
std::vector<int32_t> sequence;
820-
for (int64_t i = 0; i < lower; ++i) {
821-
sequence.push_back(grammar_expr_id);
822-
}
823-
choices.push_back(builder_.AddSequence(sequence));
824-
}
837+
choices.push_back(LegacyHandleRepetitionRange(grammar_expr_id, lower, kUnzipThreshold - 1));
838+
lower = kUnzipThreshold;
825839
}
826840

827841
std::optional<int32_t> infinite_repetition_id = std::nullopt;

tests/python/test_grammar_parser.py

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_repetition_range_exact():
146146
"""Test repetition range with exact count {n}."""
147147
before = """root ::= "a"{3}
148148
"""
149-
expected = """root ::= (((("a" "a" "a"))))
149+
expected = """root ::= ((("a" "a" "a")))
150150
"""
151151
grammar = _ebnf_to_grammar_no_normalization(before)
152152
after = str(grammar)
@@ -157,7 +157,9 @@ def test_repetition_range_min_max():
157157
"""Test repetition range with min and max {n,m}."""
158158
before = """root ::= "a"{2,4}
159159
"""
160-
expected = """root ::= (((("a" "a") | ("a" "a" "a") | ("a" "a" "a" "a"))))
160+
expected = """root ::= ((("a" "a" root_1)))
161+
root_1 ::= ("" | ("a" root_2))
162+
root_2 ::= ("" | "a")
161163
"""
162164
grammar = _ebnf_to_grammar_no_normalization(before)
163165
after = str(grammar)
@@ -168,8 +170,8 @@ def test_repetition_range_min_only():
168170
"""Test repetition range with only min {n,}."""
169171
before = """root ::= "a"{2,}
170172
"""
171-
expected = """root ::= ((("a" "a" root_repeat_inf)))
172-
root_repeat_inf ::= ("" | ("a" root_repeat_inf))
173+
expected = """root ::= ((("a" "a" root_1)))
174+
root_1 ::= ("" | ("a" root_1))
173175
"""
174176
grammar = _ebnf_to_grammar_no_normalization(before)
175177
after = str(grammar)
@@ -264,9 +266,11 @@ def test_combined_features():
264266
rule2 ::= [0-9]+ "." [0-9]*
265267
"""
266268
expected = """root ::= (("start" root_1 "end"))
267-
rule1 ::= (((([a-z]) | ([a-z] [a-z]) | ([a-z] [a-z] [a-z])))) (=((":")))
269+
rule1 ::= ((([a-z] rule1_1))) (=((":")))
268270
rule2 ::= ((rule2_1 "." [0-9]*))
269271
root_1 ::= ((((rule1) | (rule2)) root_1) | ((rule1) | (rule2)))
272+
rule1_1 ::= ("" | ([a-z] rule1_2))
273+
rule1_2 ::= ("" | [a-z])
270274
rule2_1 ::= (([0-9] rule2_1) | [0-9])
271275
"""
272276
grammar = _ebnf_to_grammar_no_normalization(before)
@@ -338,29 +342,26 @@ def test_repetition_range():
338342
"""
339343

340344
expected = """root ::= ((a b c d e f g))
341-
a ::= (("a") | ("a" "a"))
342-
b ::= ((a) | ("b") | (b_1 b_2) | (b_3 b_4 b_5) | (b_6 b_7 b_8 b_9) | (b_10 b_11 b_12 b_13 b_14))
343-
c ::= ("" | ("c") | ("c" "c"))
344-
d ::= ((d_repeat_inf))
345-
e ::= (("e" "e" e_repeat_inf))
345+
a ::= (("a" a_1))
346+
b ::= ((b_5 b_1))
347+
c ::= ((c_1))
348+
d ::= ((d_1))
349+
e ::= (("e" "e" e_1))
346350
f ::= (("f" "f" "f"))
347-
g ::= ("")
348-
d_repeat_inf ::= ("" | ("d" d_repeat_inf))
349-
e_repeat_inf ::= ("" | ("e" e_repeat_inf))
350-
b_1 ::= ((a) | ("b"))
351-
b_2 ::= ((a) | ("b"))
352-
b_3 ::= ((a) | ("b"))
353-
b_4 ::= ((a) | ("b"))
351+
g ::= (())
352+
a_1 ::= ("" | ("a"))
353+
b_1 ::= ("" | (b_1_1 b_2))
354+
b_2 ::= ("" | (b_2_1 b_3))
355+
b_3 ::= ("" | (b_3_1 b_4))
356+
b_4 ::= ("" | (a) | ("b"))
357+
c_1 ::= ("" | ("c" c_2))
358+
c_2 ::= ("" | ("c"))
359+
d_1 ::= ("" | ("d" d_1))
360+
e_1 ::= ("" | ("e" e_1))
354361
b_5 ::= ((a) | ("b"))
355-
b_6 ::= ((a) | ("b"))
356-
b_7 ::= ((a) | ("b"))
357-
b_8 ::= ((a) | ("b"))
358-
b_9 ::= ((a) | ("b"))
359-
b_10 ::= ((a) | ("b"))
360-
b_11 ::= ((a) | ("b"))
361-
b_12 ::= ((a) | ("b"))
362-
b_13 ::= ((a) | ("b"))
363-
b_14 ::= ((a) | ("b"))
362+
b_1_1 ::= ((a) | ("b"))
363+
b_2_1 ::= ((a) | ("b"))
364+
b_3_1 ::= ((a) | ("b"))
364365
"""
365366

366367
grammar = _ebnf_to_grammar_no_normalization(before)
@@ -776,17 +777,17 @@ def test_error_consecutive_quantifiers():
776777

777778
def test_repetition_normalizer():
778779
"""Test the repetition normalizer. If the context is nullable, then the min repetition time will be reduced to 0."""
779-
before = "root ::= ([0-9]*){100, 1000}"
780-
expected_grammar = r"""root ::= ((root_repeat_1{0, 985} [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]*))
781-
root_repeat_1 ::= (([0-9]*)) (=([0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]*))
780+
before = "root ::= ([0-9]*){200, 1000}"
781+
expected_grammar = r"""root ::= ((root_repeat_1{0, 872} [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]*))
782+
root_repeat_1 ::= (([0-9]*)) (=([0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]* [0-9]*))
782783
"""
783784
grammar = xgr.Grammar.from_ebnf(before)
784785
grammar = GrammarFunctor.grammar_optimizer(grammar)
785786
assert expected_grammar == str(grammar)
786787

787-
before = "root ::= ([0-9]){100, 1000}"
788-
expected_grammar = r"""root ::= ((root_repeat_1{85, 985} [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9]))
789-
root_repeat_1 ::= (([0-9])) (=([0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9]))
788+
before = "root ::= ([0-9]){200, 1000}"
789+
expected_grammar = r"""root ::= ((root_repeat_1{72, 872} [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9]))
790+
root_repeat_1 ::= (([0-9])) (=([0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9] [0-9]))
790791
"""
791792
grammar = xgr.Grammar.from_ebnf(before)
792793
grammar = GrammarFunctor.grammar_optimizer(grammar)

tests/python/test_json_schema_converter.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2178,13 +2178,18 @@ def test_generate_float_regex():
21782178
def test_limited_whitespace_cnt():
21792179
expected_grammar = r"""basic_escape ::= (([\"\\/bfnrt]) | ("u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9])) (=(basic_string_sub))
21802180
basic_string_sub ::= (("\"") | ([^\0-\x1f\"\\\r\n] basic_string_sub) | ("\\" basic_escape basic_string_sub)) (=(basic_string_sub_1 [,}\]:]))
2181-
basic_string ::= (("\"" basic_string_sub)) (=(root_4 "}"))
2182-
root ::= (("{" root_1 "\"key\"" root_2 ":" root_3 basic_string root_4 "}"))
2183-
basic_string_sub_1 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t]))
2184-
root_1 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=("\"key\"" root_2 ":" root_3 basic_string root_4 "}"))
2185-
root_2 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=(":" root_3 basic_string root_4 "}"))
2186-
root_3 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=(basic_string root_4 "}"))
2187-
root_4 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=("}"))
2181+
basic_string ::= (("\"" basic_string_sub)) (=(root_7 "}"))
2182+
root ::= (("{" root_1 "\"key\"" root_3 ":" root_5 basic_string root_7 "}"))
2183+
basic_string_sub_1 ::= ("" | ([ \n\t] basic_string_sub_2))
2184+
basic_string_sub_2 ::= ("" | ([ \n\t]))
2185+
root_1 ::= ("" | ([ \n\t] root_2)) (=("\"key\"" root_3 ":" root_5 basic_string root_7 "}"))
2186+
root_2 ::= ("" | ([ \n\t]))
2187+
root_3 ::= ("" | ([ \n\t] root_4)) (=(":" root_5 basic_string root_7 "}"))
2188+
root_4 ::= ("" | ([ \n\t]))
2189+
root_5 ::= ("" | ([ \n\t] root_6)) (=(basic_string root_7 "}"))
2190+
root_6 ::= ("" | ([ \n\t]))
2191+
root_7 ::= ("" | ([ \n\t] root_8)) (=("}"))
2192+
root_8 ::= ("" | ([ \n\t]))
21882193
"""
21892194
schema = {"type": "object", "properties": {"key": {"type": "string"}}, "required": ["key"]}
21902195
grammar = xgr.Grammar.from_json_schema(schema, any_whitespace=True, max_whitespace_cnt=2)
@@ -2200,13 +2205,18 @@ def test_limited_whitespace_cnt():
22002205
def test_limited_whitespace_compile():
22012206
expected_grammar = r"""basic_escape ::= (([\"\\/bfnrt]) | ("u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9])) (=(basic_string_sub))
22022207
basic_string_sub ::= (("\"") | ([^\0-\x1f\"\\\r\n] basic_string_sub) | ("\\" basic_escape basic_string_sub)) (=(basic_string_sub_1 [,}\]:]))
2203-
basic_string ::= (("\"" basic_string_sub)) (=(root_4 "}"))
2204-
root ::= (("{" root_1 "\"key\"" root_2 ":" root_3 basic_string root_4 "}"))
2205-
basic_string_sub_1 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t]))
2206-
root_1 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=("\"key\"" root_2 ":" root_3 basic_string root_4 "}"))
2207-
root_2 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=(":" root_3 basic_string root_4 "}"))
2208-
root_3 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=(basic_string root_4 "}"))
2209-
root_4 ::= ("" | ([ \n\t]) | ([ \n\t] [ \n\t])) (=("}"))
2208+
basic_string ::= (("\"" basic_string_sub)) (=(root_7 "}"))
2209+
root ::= (("{" root_1 "\"key\"" root_3 ":" root_5 basic_string root_7 "}"))
2210+
basic_string_sub_1 ::= ("" | ([ \n\t] basic_string_sub_2))
2211+
basic_string_sub_2 ::= ("" | ([ \n\t]))
2212+
root_1 ::= ("" | ([ \n\t] root_2)) (=("\"key\"" root_3 ":" root_5 basic_string root_7 "}"))
2213+
root_2 ::= ("" | ([ \n\t]))
2214+
root_3 ::= ("" | ([ \n\t] root_4)) (=(":" root_5 basic_string root_7 "}"))
2215+
root_4 ::= ("" | ([ \n\t]))
2216+
root_5 ::= ("" | ([ \n\t] root_6)) (=(basic_string root_7 "}"))
2217+
root_6 ::= ("" | ([ \n\t]))
2218+
root_7 ::= ("" | ([ \n\t] root_8)) (=("}"))
2219+
root_8 ::= ("" | ([ \n\t]))
22102220
"""
22112221
schema = {"type": "object", "properties": {"key": {"type": "string"}}, "required": ["key"]}
22122222
tokenizer_info = xgr.TokenizerInfo([])

0 commit comments

Comments
 (0)