Skip to content

Commit 853c7ac

Browse files
[Fix] Fix lookahead assertion analyzer. (#401)
This PR fixes the issue reported in #400, i.e. the incorrect behavior when the lookahead assertion is nullable. Moreover, this PR adds exact lookahead to improve the efficiency at runtime. --------- Signed-off-by: Yuchuan <[email protected]>
1 parent 3b9c3ae commit 853c7ac

File tree

7 files changed

+123
-31
lines changed

7 files changed

+123
-31
lines changed

cpp/grammar_builder.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,12 @@ class GrammarBuilder {
279279
grammar_->rules_[rule_id].lookahead_assertion_id = lookahead_assertion_id;
280280
}
281281

282+
void UpdateLookaheadExact(int32_t rule_id, bool is_exact = true) {
283+
XGRAMMAR_CHECK(rule_id < static_cast<int32_t>(grammar_->rules_.size()))
284+
<< "Rule id " << rule_id << " is out of range.";
285+
grammar_->rules_[rule_id].is_exact_lookahead = is_exact;
286+
}
287+
282288
/*!
283289
* \brief Add a lookahead assertion to a rule referred by the given name. The lookahead
284290
* assertion should be a sequence GrammarExpr id. An id of -1 means no lookahead assertion.

cpp/grammar_compiler.cc

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,21 @@ class GrammarMatcherForTokenMaskCache : public EarleyParser {
114114
std::pair<bool, bool> GrammarMatcherForTokenMaskCache::IsTokenPassLookaheadAssertion(
115115
const std::string& token, const std::vector<bool>& can_reach_end_stack
116116
) {
117+
bool accepted = true;
118+
bool can_reach_end = true;
117119
auto lookahead_assertion_id = grammar_->GetRule(init_rule_id).lookahead_assertion_id;
118120
if (lookahead_assertion_id == -1) {
119-
return {true, true};
121+
return {accepted, can_reach_end};
120122
}
121123
auto lookahead_state =
122124
ParserState(/*rule_id*/ -1, lookahead_assertion_id, 0, ParserState::kNoPrevInputPos, 0);
123125
PushStateAndExpand(lookahead_state);
124126
int token_len = token.size();
127+
if (IsCompleted()) {
128+
// If the lookahead assertion is already completed, we can accept the token.
129+
PopLastStates(1);
130+
return {accepted, can_reach_end};
131+
}
125132

126133
// Find all positions that can come to an end. Then check if the suffix from that position
127134
// can be accepted by the lookahead assertion.
@@ -140,20 +147,23 @@ std::pair<bool, bool> GrammarMatcherForTokenMaskCache::IsTokenPassLookaheadAsser
140147
// accepted chars: pos - i + 1
141148
// we need to rollback the pushed initial state as well
142149
PopLastStates(pos - i + 2);
143-
return {true, true};
150+
return {accepted, can_reach_end};
144151
}
145152
}
146153
// Case 2. The whole token is accepted
147154
if (last_accept_pos == token_len - 1) {
148155
PopLastStates(last_accept_pos - i + 2);
149-
return {true, false};
156+
can_reach_end = false;
157+
return {accepted, can_reach_end};
150158
}
151159
// Case 3. The token is not accepted. Check the next position.
152160
PopLastStates(last_accept_pos - i + 1);
153161
}
154162

155163
PopLastStates(1);
156-
return {false, false};
164+
can_reach_end = false;
165+
accepted = false;
166+
return {accepted, can_reach_end};
157167
}
158168

159169
// Comparator for std::pair<int32_t, std::string> based on the string value.
@@ -322,6 +332,7 @@ bool GrammarMatcherForTokenMaskCache::GetTokenMaskWithFirstCharacterCheck(
322332

323333
int prev_matched_size = 0;
324334
int last_rejected_range = 0;
335+
const bool& is_exact_lookahead = grammar_->GetRule(init_rule_id).is_exact_lookahead;
325336
const std::string* prev_token = nullptr;
326337
for (size_t interval_idx = 0; interval_idx < possible_intervals.size(); ++interval_idx) {
327338
const auto& interval = possible_intervals[interval_idx];
@@ -414,10 +425,7 @@ bool GrammarMatcherForTokenMaskCache::GetTokenMaskWithFirstCharacterCheck(
414425
// 1. If the current rule is the root rule (is_root_rule=true), there are no
415426
// uncertain tokens. Not accepted tokens are just rejected.
416427
// 2. If a token cannot pass the lookahead assertion, it is rejected.
417-
if ((!lookahead_result_pair.second) &&
418-
(std::binary_search(
419-
grammar_->exact_lookahead.begin(), grammar_->exact_lookahead.end(), init_rule_id
420-
))) {
428+
if ((!lookahead_result_pair.second) && is_exact_lookahead) {
421429
tmp_accepted_indices_.push_back(i);
422430
} else {
423431
tmp_uncertain_indices_.push_back(i);

cpp/grammar_functor.cc

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -576,17 +576,63 @@ class LookaheadAssertionAnalyzerImpl : public GrammarMutator {
576576
}
577577
for (int i = 0; i < static_cast<int>(grammar->NumRules()); ++i) {
578578
auto rule = grammar->GetRule(i);
579-
if (i == grammar->GetRootRuleId() || rule.lookahead_assertion_id != -1) {
579+
if (i == grammar->GetRootRuleId()) {
580+
continue;
581+
}
582+
if (rule.lookahead_assertion_id != -1) {
583+
builder_->UpdateLookaheadExact(i, IsExactLookaheadAssertion(i));
580584
continue;
581585
}
582586
auto look_head_assertion_id = DetectLookaheadAssertion(i);
583587
if (look_head_assertion_id != -1) {
584588
builder_->UpdateLookaheadAssertion(i, look_head_assertion_id);
589+
builder_->UpdateLookaheadExact(i);
585590
}
586591
}
587592
return builder_->Get(grammar->GetRootRuleId());
588593
}
589594

595+
bool IsExactLookaheadAssertion(int32_t rule_id) {
596+
XGRAMMAR_DCHECK(base_grammar_->GetRule(rule_id).lookahead_assertion_id != -1);
597+
bool found = false;
598+
for (int i = 0; i < static_cast<int>(base_grammar_->NumRules()); ++i) {
599+
auto rule = base_grammar_->GetRule(i);
600+
auto grammar_expr = base_grammar_->GetGrammarExpr(rule.body_expr_id);
601+
if (grammar_expr.type == GrammarExprType::kTagDispatch) {
602+
for (int j = 1; j < grammar_expr.size() - 3; j += 2) {
603+
if (grammar_expr[j] == rule_id) {
604+
return false;
605+
}
606+
}
607+
continue;
608+
}
609+
XGRAMMAR_DCHECK(grammar_expr.type == GrammarExprType::kChoices);
610+
for (auto sequence_id : grammar_expr) {
611+
auto sequence_expr = base_grammar_->GetGrammarExpr(sequence_id);
612+
if (sequence_expr.type != GrammarExprType::kSequence) {
613+
continue;
614+
}
615+
auto last_element = base_grammar_->GetGrammarExpr(sequence_expr.end()[-1]);
616+
if (last_element.type == GrammarExprType::kRuleRef && last_element[0] == rule_id &&
617+
i != rule_id) {
618+
return false;
619+
}
620+
621+
for (int j = 0; j < sequence_expr.size() - 1; ++j) {
622+
auto element_expr = base_grammar_->GetGrammarExpr(sequence_expr[j]);
623+
if (element_expr.type != GrammarExprType::kRuleRef || element_expr[0] != rule_id) {
624+
continue;
625+
}
626+
if (found) {
627+
return false;
628+
}
629+
found = true;
630+
}
631+
}
632+
}
633+
return found;
634+
}
635+
590636
int32_t DetectLookaheadAssertion(int32_t rule_id) {
591637
std::vector<int32_t> found_sequence; // Element ids
592638
bool found = false;
@@ -1648,7 +1694,7 @@ class RepetitionNormalizerImpl {
16481694
continue;
16491695
}
16501696
int repeat_rule_id = expr[0];
1651-
(*grammar)->exact_lookahead.push_back(repeat_rule_id);
1697+
grammar->ImplPtr()->GetRule(repeat_rule_id).is_exact_lookahead = true;
16521698
if (std::binary_search(
16531699
(*grammar)->allow_empty_rule_ids.begin(),
16541700
(*grammar)->allow_empty_rule_ids.end(),
@@ -1658,7 +1704,6 @@ class RepetitionNormalizerImpl {
16581704
expr.SetData(1, 0); // Set min repeat to 0
16591705
}
16601706
}
1661-
std::sort((*grammar)->exact_lookahead.begin(), (*grammar)->exact_lookahead.end());
16621707
}
16631708
};
16641709

cpp/grammar_impl.h

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ class Grammar::Impl {
7878
/*! \brief The id of the associated lookahead assertion expr. For now it must be a id of a
7979
* sequence GrammarExpr. -1 if not exists. */
8080
int32_t lookahead_assertion_id = -1;
81+
/*! \brief Whether the lookahead assertion is exact. */
82+
bool is_exact_lookahead = false;
8183
};
8284

8385
/*! \brief Get the number of rules. */
@@ -88,6 +90,11 @@ class Grammar::Impl {
8890
<< "rule_id " << rule_id << " is out of bound";
8991
return rules_[rule_id];
9092
}
93+
Rule& GetRule(int32_t rule_id) {
94+
XGRAMMAR_DCHECK(rule_id >= 0 && rule_id < static_cast<int32_t>(rules_.size()))
95+
<< "rule_id " << rule_id << " is out of bound";
96+
return rules_[rule_id];
97+
}
9198
/*! \brief Get the root rule id of the grammar. */
9299
int32_t GetRootRuleId() const { return root_rule_id_; }
93100
/*! \brief Get the root rule of the grammar. */
@@ -250,9 +257,6 @@ class Grammar::Impl {
250257
/*! \brief The ids of the rules that are allowed to be empty. */
251258
std::vector<int32_t> allow_empty_rule_ids;
252259

253-
/*! \brief Store the lookahead which are exact, used to reduce uncertainty.*/
254-
std::vector<int32_t> exact_lookahead;
255-
256260
friend class GrammarBuilder;
257261
friend class GrammarCompiler;
258262

@@ -264,7 +268,8 @@ XGRAMMAR_MEMBER_ARRAY(
264268
Grammar::Impl::Rule,
265269
&Grammar::Impl::Rule::name,
266270
&Grammar::Impl::Rule::body_expr_id,
267-
&Grammar::Impl::Rule::lookahead_assertion_id
271+
&Grammar::Impl::Rule::lookahead_assertion_id,
272+
&Grammar::Impl::Rule::is_exact_lookahead
268273
);
269274

270275
XGRAMMAR_MEMBER_TABLE(
@@ -282,9 +287,7 @@ XGRAMMAR_MEMBER_TABLE(
282287
"per_rule_fsms",
283288
&Grammar::Impl::per_rule_fsms,
284289
"allow_empty_rule_ids",
285-
&Grammar::Impl::allow_empty_rule_ids,
286-
"exact_lookahead",
287-
&Grammar::Impl::exact_lookahead
290+
&Grammar::Impl::allow_empty_rule_ids
288291
);
289292

290293
} // namespace xgrammar

cpp/support/json_serializer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ class SerializeVersion {
6262
* \brief The current serialization version. When the serialization result of any object in
6363
* XGrammar is changed, this version should be bumped.
6464
*/
65-
static constexpr const char kXGrammarSerializeVersion[] = "v4";
65+
static constexpr const char kXGrammarSerializeVersion[] = "v5";
6666
};
6767

6868
/*!

tests/python/test_grammar_matcher_regex.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33

44
import pytest
5+
import torch
56
from transformers import AutoTokenizer
67

78
import xgrammar as xgr
@@ -173,5 +174,37 @@ def test_regex_with_large_range_compilation():
173174
print(f"Time to compile regex with large range: {(time_end - time_start) / 1e3} us")
174175

175176

177+
@pytest.mark.hf_token_required
178+
def test_regression_lookahead_already_completed():
179+
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
180+
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
181+
xgr_compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=1)
182+
compiled_grammar = xgr_compiler.compile_regex(r"\/\*(\*+[^*\/]|[^*])*\*+\/")
183+
matcher = xgr.GrammarMatcher(compiled_grammar)
184+
185+
token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
186+
187+
def process_logit(input_ids: list, logit: torch.Tensor) -> torch.Tensor:
188+
if input_ids:
189+
last_token = input_ids[-1]
190+
assert matcher.accept_token(last_token)
191+
matcher.fill_next_token_bitmask(token_bitmask)
192+
xgr.apply_token_bitmask_inplace(logit, token_bitmask)
193+
return logit
194+
195+
def process_tokens(tokens: list):
196+
for i in range(len(tokens)):
197+
logit = torch.zeros((tokenizer_info.vocab_size,), dtype=torch.float)
198+
visible_tokens = tokens[:i]
199+
masked_logit = process_logit(visible_tokens, logit)
200+
assert masked_logit[tokens[i]] != float(
201+
"-inf"
202+
), f"token {i} ({tokens[i]}, {tokenizer.decode(tokens[i])!r}) is masked"
203+
204+
text = "/* */"
205+
tokens = tokenizer.encode(text)
206+
process_tokens(tokens)
207+
208+
176209
if __name__ == "__main__":
177210
pytest.main(sys.argv)

tests/python/test_serialization.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ def construct_compiled_grammar():
4141

4242
def test_get_serialization_version():
4343
"""Test the version of the serialized JSON string."""
44-
assert xgr.get_serialization_version() == "v4"
44+
assert xgr.get_serialization_version() == "v5"
4545

4646

4747
def test_serialize_grammar():
4848
"""Test Grammar serialization produces expected JSON string."""
4949
grammar = construct_grammar()
5050
serialized = grammar.serialize_json()
5151
expected_json = {
52-
"rules": [["rule1", 4, 9], ["root_rule", 8, -1]],
52+
"rules": [["rule1", 4, 9, True], ["root_rule", 8, -1, False]],
5353
"grammar_expr_data": [0, 2, 7, 10, 14, 18, 21, 24, 28, 31],
5454
"grammar_expr_indptr": [
5555
# fmt: off
@@ -60,8 +60,7 @@ def test_serialize_grammar():
6060
"complete_fsm": None,
6161
"per_rule_fsms": [],
6262
"allow_empty_rule_ids": [],
63-
"exact_lookahead": [],
64-
"__VERSION__": "v4",
63+
"__VERSION__": "v5",
6564
}
6665
# The fsms are the same one, but the start state and end states are different.
6766
assert json.loads(serialized) == expected_json
@@ -70,7 +69,7 @@ def test_serialize_grammar():
7069
def test_serialize_grammar_exception():
7170
"""Test Grammar serialization produces expected JSON string."""
7271
expected_json = {
73-
"rules": [["rule1", 4, 9], ["root_rule", 8, -1]],
72+
"rules": [["rule1", 4, 9, True], ["root_rule", 8, -1, False]],
7473
"grammar_expr_data": [0, 2, 7, 10, 14, 18, 21, 24, 28, 31],
7574
"grammar_expr_indptr": [
7675
# fmt: off
@@ -81,15 +80,14 @@ def test_serialize_grammar_exception():
8180
"allow_empty_rule_ids": [],
8281
"complete_fsm": None,
8382
"per_rule_fsms": [],
84-
"exact_lookahead": [],
85-
"__VERSION__": "v4",
83+
"__VERSION__": "v5",
8684
}
8785

8886
expected_json["__VERSION__"] = "v1" # Change version to trigger error
8987
with pytest.raises(xgr.DeserializeVersionError):
9088
xgr.Grammar.deserialize_json(json.dumps(expected_json))
9189

92-
expected_json["__VERSION__"] = "v4"
90+
expected_json["__VERSION__"] = "v5"
9391
expected_json.pop("rules") # Remove required field to trigger error
9492
with pytest.raises(xgr.DeserializeFormatError):
9593
xgr.Grammar.deserialize_json(json.dumps(expected_json))
@@ -141,7 +139,7 @@ def test_serialize_tokenizer_info():
141139
'"decoded_vocab":["1","212","a","A","b","\\u00e4\\u00b8\\u0080","-","aBc","abc"],'
142140
'"sorted_decoded_vocab":[[6,"-"],[3,"A"],[2,"a"],[7,"aBc"],[8,"abc"],[4,"b"],[5,"\\u00e4\\u00b8\\u0080"]],'
143141
'"trie_subtree_nodes_range":[1,2,5,4,5,6,7],'
144-
'"__VERSION__":"v4"}'
142+
'"__VERSION__":"v5"}'
145143
)
146144
assert json.loads(serialized) == json.loads(expected_json)
147145

@@ -195,7 +193,7 @@ def test_serialize_compiled_grammar():
195193

196194
expected_json = {
197195
"grammar": {
198-
"rules": [["rule1", 4, 6], ["root_rule", 10, -1]],
196+
"rules": [["rule1", 4, 6, True], ["root_rule", 10, -1, False]],
199197
"grammar_expr_data": [0, 2, 7, 10, 14, 18, 21, 24, 27, 30, 34],
200198
"grammar_expr_indptr": [
201199
# fmt: off
@@ -215,15 +213,14 @@ def test_serialize_compiled_grammar():
215213
[{'data_': [[0, 47, 3], [58, 127, 3], [192, 223, 1], [224, 239, 4], [240, 247, 5], [128, 191, 3], [-2, 0, 2], [128, 191, 1], [128, 191, 4], [-2, 0, 8], [97, 97, 6]],
216214
'indptr_': [0, 5, 6, 6, 7, 8, 9, 9, 10, 11]}, 7, [6], False]],
217215
# fmt: on
218-
"exact_lookahead": [],
219216
},
220217
"tokenizer_metadata": {
221218
"vocab_type": 1,
222219
"vocab_size": 10,
223220
"add_prefix_space": True,
224221
"stop_token_ids": [0, 1],
225222
},
226-
"__VERSION__": "v4",
223+
"__VERSION__": "v5",
227224
}
228225

229226
class AdaptiveTokenMask(BaseModel):

0 commit comments

Comments
 (0)