Commit 4da011f

Expose grammar match() method
1 parent 9306961 commit 4da011f

3 files changed (+53 −7 lines changed)


guidance/_grammar.py

Lines changed: 38 additions & 0 deletions
@@ -5,6 +5,7 @@
 import types
 import re
 from . import _serialization_pb2
+from . import _parser

 tag_start = "{{G|"
 tag_end = "|G}}"
@@ -83,6 +84,27 @@ def __radd__(model):
             return self(model)
         return RawFunction(__radd__, [], {})

+class Match:
+    def __init__(self, captures, log_probs, partial):
+        self.captures = captures
+        self.log_probs = log_probs
+        self.partial = partial
+
+    def __getitem__(self, key):
+        return self.captures[key]
+
+    def __len__(self):
+        return len(self.captures)
+
+    def __bool__(self):
+        return True
+
+    def __str__(self):
+        return str(self.captures)
+
+    def __repr__(self):
+        return "<guidance.Match object; captures="+str(self.captures)+"; partial="+str(self.partial)+">"
+
 class GrammarFunction(Function):
     num_used_names = 0

@@ -123,6 +145,22 @@ def __radd__(self, value):
     def __getitem__(self, value):
         raise StatefulException("GrammarFunctions can't access state!")

+    def match(self, byte_string, allow_partial=False):
+        if isinstance(byte_string, str):
+            byte_string = byte_string.encode()
+        parser = _parser.EarleyCommitParser(self)
+
+        for i in range(len(byte_string)):
+            try:
+                parser.consume_byte(byte_string[i:i+1])
+            except:
+                return None
+
+        if not allow_partial and not parser.matched():
+            return None
+        else:
+            return Match(*parser.get_captures(), partial=not parser.matched())
+
     @staticmethod
     def _new_name():
         num_used = GrammarFunction.num_used_names
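For illustration, a minimal usage sketch of the new match() method, assuming guidance.select builds a GrammarFunction with a named capture (the capture value may surface as bytes at this layer); this is an illustrative example, not part of the commit:

    from guidance import select

    # a tiny grammar that accepts either "cat" or "dog" and captures the choice
    grammar = select(["cat", "dog"], name="animal")

    m = grammar.match("cat")         # full match -> a Match object (always truthy)
    print(m["animal"], m.partial)    # captured value (possibly bytes here), partial == False

    print(grammar.match("ca"))                        # None: incomplete and partial matches not allowed
    print(grammar.match("ca", allow_partial=True))    # Match with partial == True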

guidance/models/_mock.py

Lines changed: 14 additions & 6 deletions
@@ -4,7 +4,7 @@
 from ._remote import RemoteEngine

 class MockEngine(Engine):
-    def __init__(self, tokenizer, byte_patterns, compute_log_probs):
+    def __init__(self, tokenizer, byte_patterns, compute_log_probs, force):
         super().__init__(tokenizer, compute_log_probs=compute_log_probs)

         self._valid_mask = np.zeros(len(tokenizer.tokens))
@@ -14,6 +14,7 @@ def __init__(self, tokenizer, byte_patterns, compute_log_probs):
                 self._valid_mask[i] = 1.0
             except:
                 pass
+        self.force = force

         # allow a single byte pattern to be passed
         if isinstance(byte_patterns, (bytes, str)):
@@ -23,8 +24,10 @@ def __init__(self, tokenizer, byte_patterns, compute_log_probs):
         for i,pattern in enumerate(byte_patterns):
             if isinstance(pattern, str):
                 byte_patterns[i] = pattern.encode("utf8")
-
+
         self.byte_patterns = byte_patterns
+
+        # seed the random number generator
         self._rand_generator = np.random.default_rng(seed=42)

     def get_logits(self, token_ids, forced_bytes, current_temp):
@@ -34,8 +37,13 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
         # build the byte strings
         byte_string = b"".join(self.tokenizer.tokens[i] for i in token_ids)

-        # we randomly generate valid unicode bytes
-        logits = self._rand_generator.standard_normal(len(self.tokenizer.tokens)) * self._valid_mask
+        # if we are forcing the bytes patterns then don't allow other tokens
+        if self.force:
+            logits = np.ones(len(self.tokenizer.tokens)) * -np.inf
+
+        # otherwise we randomly generate valid unicode bytes
+        else:
+            logits = self._rand_generator.standard_normal(len(self.tokenizer.tokens)) * self._valid_mask

         # if we have a pattern that matches then force the next token
         bias = 100.0
@@ -55,7 +63,7 @@ def _get_next_tokens(self, byte_string):
                 yield i

 class Mock(Model):
-    def __init__(self, byte_patterns=[], echo=True, compute_log_probs=False, **kwargs):
+    def __init__(self, byte_patterns=[], echo=True, compute_log_probs=False, force=False, **kwargs):
         '''Build a new Mock model object that represents a model in a given state.'''

         if isinstance(byte_patterns, str) and byte_patterns.startswith("http"):
@@ -67,7 +75,7 @@ def __init__(self, byte_patterns=[], echo=True, compute_log_probs=False, **kwarg
                 0,
                 0
             )
-            engine = MockEngine(tokenizer, byte_patterns, compute_log_probs)
+            engine = MockEngine(tokenizer, byte_patterns, compute_log_probs, force)


         super().__init__(
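A hypothetical test-style sketch of the new force flag; the prompt text, the byte pattern, and the "<s>" prefix are illustrative assumptions that follow the existing Mock conventions:

    from guidance import models, select

    # with force=True the mock engine starts from -inf logits, so generation can
    # only follow the supplied byte patterns instead of random valid-utf8 tokens
    lm = models.Mock(b"<s>the best animal is a cat", force=True)
    lm += "the best animal is a " + select(["cat", "dog"], name="animal")
    print(lm["animal"])   # expected to capture "cat", the forced pattern's continuation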

guidance/models/_model.py

Lines changed: 1 addition & 1 deletion
@@ -311,6 +311,7 @@ def __call__(self, parser, grammar, ensure_bos_token=True):
                 # if we walked all the way to a forced token then we advance without computing the logits
                 # we are forced if there are no more options and we are either in the middle of the grammar or at a trie leaf
                 is_forced = next_byte_mask_sum <= 1 and (len(trie) == 0 if parser.matched() else trie != self._token_trie)
+                token_pos = 0
                 if is_forced:
                     sampled_token_ind = trie.value
                     sampled_token = self.tokenizer.tokens[sampled_token_ind]
@@ -319,7 +320,6 @@ def __call__(self, parser, grammar, ensure_bos_token=True):

                 # we are at the end of the grammar
                 elif next_byte_mask_sum == 0:
-                    token_pos = 0

                     # mark the token we "sampled" if we have comsumed some bytes
                     if trie != self._token_trie:
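The token_pos change appears to hoist the initialization so that every branch sees a defined value; a schematic illustration of the pattern (not the engine code, and assuming a later read of token_pos on the forced path):

    # schematic only: why hoisting the initialization matters
    def before(is_forced, at_grammar_end):
        if is_forced:
            pass                 # token_pos never assigned on this path
        elif at_grammar_end:
            token_pos = 0
        return token_pos         # UnboundLocalError when is_forced is True

    def after(is_forced, at_grammar_end):
        token_pos = 0            # defined before the branches, on every path
        if is_forced:
            pass
        elif at_grammar_end:
            pass
        return token_pos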
