
Commit 4187419

Fix #609 to ensure token consistency
1 parent 513966f commit 4187419

File tree

2 files changed (+18, -7 lines)


guidance/_grammar.py

Lines changed: 16 additions & 6 deletions
@@ -7,19 +7,29 @@
 from . import _serialization_pb2
 from . import _parser
 
-tag_start = "{{G|"
-tag_end = "|G}}"
-_call_pool = {}
-_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end))
+
+# to support the embedding of guidance functions inside Python f-strings we use tags with these delimiters
+tag_start = "{{G|" # start of a call tag
+tag_end = "|G}}" # end of a call tag
+_call_pool = {} # the functions associated with the call tags
+_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end)) # the pattern for matching call tags
 
 class StatefulException(Exception):
     '''This is raised when we try and use the state of a grammar object like it was a live model.
 
-    Note that eventually we do want to support stateful parser/grammar constructs directly, but
-    for now we use a traditional parser and grammar separation (hence the need for this exception).'''
+    Note that eventually it would be nice to support stateful parser/grammar constructs directly, but
+    such "parser combinators" cannot be run effciently in Python. So we use a traditional parser and
+    grammar separation (hence the need for this exception).'''
     pass
 
 class Function():
+    ''' This is the abstract class representing all guidance functions.
+
+    There are two main subclasses: GrammarFunction and RawFunction. GrammarFunctions
+    represent guidance grammars that can be serialized and sent across the wire, while
+    RawFunctions represent unconstrained native Python functions.
+    '''
+
     def __init__(self, name, value=None) -> None:
         self.name = name
         self.value = value
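
For context on this hunk: the delimiters let a guidance function be embedded inside a Python f-string as a plain text tag and recovered later by regex. A minimal sketch of that round trip is below; the register() helper, the demo greet() function, and the use of id() as the pool key are illustrative assumptions, not guidance's actual API.

import re

# Delimiters and pattern copied from the hunk above.
tag_start = "{{G|"
tag_end = "|G}}"
_call_pool = {}
_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end))

def register(fn):
    # Hypothetical helper: stash the function under a key and return the
    # tag text that stands in for it inside an f-string.
    key = str(id(fn))
    _call_pool[key] = fn
    return tag_start + key + tag_end

def greet():
    return "hello"

template = f"prefix {register(greet)} suffix"

# The pattern recovers each key, which indexes back into _call_pool.
for match in _tag_pattern.finditer(template):
    print(_call_pool[match.group(1)]())  # prints: hello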

guidance/models/transformers/_transformers.py

Lines changed: 2 additions & 1 deletion
@@ -93,7 +93,8 @@ def _model_and_tokenizer(self, model, tokenizer, **kwargs):
         return model, tokenizer
 
     def _joint_tokenize(self, token_ids):
-        first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
+        # first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
+        first_decode = b''.join([self.tokenizer.tokens[id] for id in token_ids]).decode("utf8")
         new_ids = self.tokenizer._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]
 
         # HACK: check for a bug in the HuggingFace tokenizer (that will just add extra spaces during an encode-decode cycle)
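
On this hunk: the HuggingFace decode() can alter whitespace during an encode-decode cycle (as the HACK comment above notes), so re-encoding its output is not guaranteed to reproduce the original token ids. Joining the raw per-token byte strings keeps the reconstruction byte-exact. A toy sketch of the difference, assuming a made-up ToyTokenizer whose tokens table plays the role of guidance's per-token byte strings (not the library's real classes):

class ToyTokenizer:
    """Toy stand-in for a tokenizer wrapper that keeps the raw byte string
    of every token, so text can be rebuilt exactly from token ids."""

    def __init__(self):
        # id -> raw token bytes (illustrative vocabulary)
        self.tokens = {0: b"Hello", 1: b" ", 2: b" world", 3: b"!"}

    def lossy_decode(self, ids):
        # Mimics a decode() that normalizes whitespace: the standalone
        # space token is dropped, so re-encoding would yield different ids.
        return "".join(self.tokens[i].decode("utf8") for i in ids if self.tokens[i] != b" ")

    def exact_decode(self, ids):
        # The approach in the commit: concatenate the raw token bytes first,
        # then decode once, so nothing gets normalized away.
        return b"".join(self.tokens[i] for i in ids).decode("utf8")

tok = ToyTokenizer()
ids = [0, 1, 2, 3]
print(repr(tok.lossy_decode(ids)))  # 'Hello world!'  -- the extra space is lost
print(repr(tok.exact_decode(ids)))  # 'Hello  world!' -- byte-exact reconstruction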
