@@ -39,13 +39,23 @@ using SchemaError = TypedError<SchemaErrorType>;
3939 */
4040class IndentManager {
4141 public:
42- IndentManager (std::optional<int > indent, const std::string& separator, bool any_whitespace)
42+ IndentManager (
43+ std::optional<int > indent,
44+ const std::string& separator,
45+ bool any_whitespace,
46+ std::optional<int > max_whitespace_cnt
47+ )
4348 : any_whitespace_(any_whitespace),
4449 enable_newline_ (indent.has_value()),
4550 indent_(indent.value_or(0 )),
4651 separator_(separator),
4752 total_indent_(0 ),
48- is_first_({true }) {}
53+ is_first_({true }),
54+ max_whitespace_cnt_(max_whitespace_cnt) {
55+ if (max_whitespace_cnt.has_value () && max_whitespace_cnt.value () <= 0 ) {
56+ XGRAMMAR_LOG (FATAL) << (" max_whitespace_cnt must be positive." );
57+ }
58+ }
4959
5060 /* ! \brief Enter a new indent level. */
5161 void StartIndent () {
@@ -104,12 +114,17 @@ class IndentManager {
104114 std::string separator_;
105115 int64_t total_indent_;
106116 std::vector<bool > is_first_;
117+ std::optional<int > max_whitespace_cnt_;
107118 friend class JSONSchemaConverter ;
108119};
109120
110121std::string IndentManager::StartSeparator () {
111122 if (any_whitespace_) {
112- return " [ \\ n\\ t]*" ;
123+ if (!max_whitespace_cnt_.has_value ()) {
124+ return " [ \\ n\\ t]*" ;
125+ } else {
126+ return " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
127+ }
113128 }
114129 if (!enable_newline_) {
115130 return " \"\" " ;
@@ -119,7 +134,13 @@ std::string IndentManager::StartSeparator() {
119134
120135std::string IndentManager::MiddleSeparator () {
121136 if (any_whitespace_) {
122- return " [ \\ n\\ t]* \" " + separator_ + " \" [ \\ n\\ t]*" ;
137+ std::string whitespace_part;
138+ if (!max_whitespace_cnt_.has_value ()) {
139+ whitespace_part = " [ \\ n\\ t]*" ;
140+ } else {
141+ whitespace_part = " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
142+ }
143+ return whitespace_part + " \" " + separator_ + " \" " + whitespace_part;
123144 }
124145 if (!enable_newline_) {
125146 return " \" " + separator_ + " \" " ;
@@ -129,7 +150,11 @@ std::string IndentManager::MiddleSeparator() {
129150
130151std::string IndentManager::EndSeparator () {
131152 if (any_whitespace_) {
132- return " [ \\ n\\ t]*" ;
153+ if (!max_whitespace_cnt_.has_value ()) {
154+ return " [ \\ n\\ t]*" ;
155+ } else {
156+ return " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
157+ }
133158 }
134159 if (!enable_newline_) {
135160 return " \"\" " ;
@@ -139,7 +164,11 @@ std::string IndentManager::EndSeparator() {
139164
140165std::string IndentManager::EmptySeparator () {
141166 if (any_whitespace_) {
142- return " [ \\ n\\ t]*" ;
167+ if (!max_whitespace_cnt_.has_value ()) {
168+ return " [ \\ n\\ t]*" ;
169+ } else {
170+ return " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
171+ }
143172 }
144173 return " \"\" " ;
145174}
@@ -148,9 +177,19 @@ std::string IndentManager::NextSeparator(bool is_end) {
148177 if (any_whitespace_) {
149178 if (is_first_.back () || is_end) {
150179 is_first_.back () = false ;
151- return " [ \\ n\\ t]*" ;
180+ if (!max_whitespace_cnt_.has_value ()) {
181+ return " [ \\ n\\ t]*" ;
182+ } else {
183+ return " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
184+ }
152185 } else {
153- return " [ \\ n\\ t]* \" " + separator_ + " \" [ \\ n\\ t]*" ;
186+ std::string whitespace_part;
187+ if (!max_whitespace_cnt_.has_value ()) {
188+ whitespace_part = " [ \\ n\\ t]*" ;
189+ } else {
190+ whitespace_part = " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
191+ }
192+ return whitespace_part + " \" " + separator_ + " \" " + whitespace_part;
154193 }
155194 }
156195
@@ -189,6 +228,7 @@ class JSONSchemaConverter {
189228 std::optional<int > indent,
190229 std::optional<std::pair<std::string, std::string>> separators,
191230 bool strict_mode,
231+ std::optional<int > max_whitespace_cnt = std::nullopt ,
192232 JSONFormat json_format = JSONFormat::kJSON
193233 );
194234
@@ -224,7 +264,6 @@ class JSONSchemaConverter {
224264 inline static const std::string kXMLEscape = " xml_escape" ;
225265 inline static const std::string kXMLString = " xml_string" ;
226266 inline static const std::string kXMLVariableName = " xml_variable_name" ;
227- inline static const std::string kWhiteSpace = " [ \\ n\\ t]*" ;
228267
229268 /* ! \brief Add the basic rules to the rules list and the basic_rules_cache. */
230269 void AddBasicRules (JSONFormat json_format);
@@ -517,6 +556,13 @@ class JSONSchemaConverter {
517556 bool any_whitespace_;
518557 // The cache for URI to rule. Mapping from the URI to the rule name.
519558 std::unordered_map<std::string, std::string> uri_to_rule_cache_;
559+ // The maximum number of whitespaces allowed when any_whitespace_ is true.
560+ std::optional<int > max_whitespace_cnt_;
561+
562+ const std::string kWhiteSpace =
563+ max_whitespace_cnt_.has_value()
564+ ? " [ \\ n\\ t]{0," + std::to_string(max_whitespace_cnt_.value()) + " }"
565+ : " [ \\ n\\ t]*" ;
520566};
521567
522568JSONSchemaConverter::JSONSchemaConverter (
@@ -525,9 +571,13 @@ JSONSchemaConverter::JSONSchemaConverter(
525571 std::optional<int > indent,
526572 std::optional<std::pair<std::string, std::string>> separators,
527573 bool strict_mode,
574+ std::optional<int > max_whitespace_cnt,
528575 JSONFormat json_format
529576)
530- : json_schema_(json_schema), strict_mode_(strict_mode), any_whitespace_(any_whitespace) {
577+ : json_schema_(json_schema),
578+ strict_mode_(strict_mode),
579+ any_whitespace_(any_whitespace),
580+ max_whitespace_cnt_(max_whitespace_cnt) {
531581 if (!separators.has_value ()) {
532582 if (indent == std::nullopt ) {
533583 separators = std::make_pair (" , " , " : " );
@@ -538,9 +588,15 @@ JSONSchemaConverter::JSONSchemaConverter(
538588 if (any_whitespace) {
539589 separators = std::make_pair (" ," , " :" );
540590 }
541- indentManager_ = IndentManager (indent, separators->first , any_whitespace);
591+ indentManager_ = IndentManager (indent, separators->first , any_whitespace, max_whitespace_cnt );
542592 if (any_whitespace) {
543- colon_pattern_ = " [ \\ n\\ t]* \" " + separators->second + " \" [ \\ n\\ t]*" ;
593+ std::string whitespace_part;
594+ if (!max_whitespace_cnt_.has_value ()) {
595+ whitespace_part = " [ \\ n\\ t]*" ;
596+ } else {
597+ whitespace_part = " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
598+ }
599+ colon_pattern_ = whitespace_part + " \" " + separators->second + " \" " + whitespace_part;
544600 } else {
545601 colon_pattern_ = " \" " + separators->second + " \" " ;
546602 }
@@ -579,9 +635,9 @@ void JSONSchemaConverter::AddBasicRules(JSONFormat json_format) {
579635
580636 auto past_indent_manager = indentManager_;
581637 if (any_whitespace_) {
582- indentManager_ = IndentManager (std::nullopt , " ," , true );
638+ indentManager_ = IndentManager (std::nullopt , " ," , true , std:: nullopt );
583639 } else {
584- indentManager_ = IndentManager (std::nullopt , " , " , false );
640+ indentManager_ = IndentManager (std::nullopt , " , " , false , std:: nullopt );
585641 }
586642 AddJSONHelperRules ();
587643 if (json_format == JSONFormat::kXML ) {
@@ -628,14 +684,28 @@ void JSONSchemaConverter::AddJSONHelperRules() {
628684 ebnf_script_creator_.AddRule (
629685 kBasicEscape , " [\"\\\\ /bfnrt] | \" u\" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]"
630686 );
687+ std::string whitespace_part;
688+ if (!max_whitespace_cnt_.has_value ()) {
689+ whitespace_part = " [ \\ n\\ t]*" ;
690+ } else {
691+ whitespace_part = " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
692+ }
631693 ebnf_script_creator_.AddRule (
632694 kBasicStringSub ,
633695 " (\"\\\"\" | [^\\ 0-\\ x1f\\\"\\\\\\ r\\ n] " + kBasicStringSub + " | \"\\\\\" " + kBasicEscape +
634- " " + kBasicStringSub + " ) (= [ \\ n \\ t]* [,}\\ ]:])"
696+ " " + kBasicStringSub + " ) (= " + whitespace_part + " [,}\\ ]:])"
635697 );
636698}
637699
638700void JSONSchemaConverter::AddXMLHelperRules () {
701+ std::string whitespace_part;
702+ if (any_whitespace_) {
703+ if (!max_whitespace_cnt_.has_value ()) {
704+ whitespace_part = " [ \\ n\\ t]*" ;
705+ } else {
706+ whitespace_part = " [ \\ n\\ t]{0," + std::to_string (max_whitespace_cnt_.value ()) + " }" ;
707+ }
708+ }
639709 ebnf_script_creator_.AddRule (
640710 kXMLEscape , " [\"\\\\ /bfnrt] | \" u\" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]"
641711 );
@@ -645,7 +715,7 @@ void JSONSchemaConverter::AddXMLHelperRules() {
645715 ebnf_script_creator_.AddRule (
646716 kXMLString ,
647717 " (\"\" | [^<>&\\ 0-\\ x1f\\\\\\ r\\ n] " + kXMLString + " | \"\\\\\" " + kXMLEscape + " " +
648- kXMLString + " | " + kXMLEntity + " " + kXMLString + " ) (= [ \\ n \\ t]* )"
718+ kXMLString + " | " + kXMLEntity + " " + kXMLString + " ) (= " + whitespace_part + " )"
649719 );
650720 ebnf_script_creator_.AddRule (kXMLVariableName , " [a-zA-Z_] [a-zA-Z0-9_]*" );
651721}
@@ -3278,7 +3348,13 @@ std::string JSONSchemaConverter::VisitObject(
32783348 result += " \" }\" " ;
32793349 if (could_be_empty) {
32803350 // result = (result) | {}
3281- auto rest = " \" {\" " + std::string (any_whitespace_ ? " [ \\ n\\ t]* " : " " ) + " \" }\" " ;
3351+ std::string whitespace_part;
3352+ if (max_whitespace_cnt_ == std::nullopt ) {
3353+ whitespace_part = " [ \\ n\\ t]* " ;
3354+ } else {
3355+ whitespace_part = " [ \\ n\\ t]{0," + std::to_string (*max_whitespace_cnt_) + " } " ;
3356+ }
3357+ auto rest = " \" {\" " + std::string (any_whitespace_ ? whitespace_part : " " ) + " \" }\" " ;
32823358 if (result == " \" {\" \" }\" " ) {
32833359 result = rest;
32843360 } else {
@@ -3329,14 +3405,15 @@ std::string JSONSchemaToEBNF(
33293405 std::optional<int > indent,
33303406 std::optional<std::pair<std::string, std::string>> separators,
33313407 bool strict_mode,
3408+ std::optional<int > max_whitespace_cnt,
33323409 JSONFormat json_format
33333410) {
33343411 picojson::value schema_value;
33353412 std::string err = picojson::parse (schema_value, schema);
33363413 XGRAMMAR_CHECK (err.empty ()) << " Failed to parse JSON: " << err
33373414 << " . The JSON string is:" << schema;
33383415 return JSONSchemaToEBNF (
3339- schema_value, any_whitespace, indent, separators, strict_mode, json_format
3416+ schema_value, any_whitespace, indent, separators, strict_mode, max_whitespace_cnt, json_format
33403417 );
33413418}
33423419
@@ -3346,10 +3423,11 @@ std::string JSONSchemaToEBNF(
33463423 std::optional<int > indent,
33473424 std::optional<std::pair<std::string, std::string>> separators,
33483425 bool strict_mode,
3426+ std::optional<int > max_whitespace_cnt,
33493427 JSONFormat json_format
33503428) {
33513429 JSONSchemaConverter converter (
3352- schema, any_whitespace, indent, separators, strict_mode, json_format
3430+ schema, any_whitespace, indent, separators, strict_mode, max_whitespace_cnt, json_format
33533431 );
33543432 return converter.Convert (json_format);
33553433}
0 commit comments