
Commit 4187419

Fix #609 to ensure token consistency
1 parent 513966f commit 4187419

File tree

2 files changed (+18, -7 lines)


guidance/_grammar.py

Lines changed: 16 additions & 6 deletions
@@ -7,19 +7,29 @@
 from . import _serialization_pb2
 from . import _parser
 
-tag_start = "{{G|"
-tag_end = "|G}}"
-_call_pool = {}
-_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end))
+
+# to support the embedding of guidance functions inside Python f-strings we use tags with these delimiters
+tag_start = "{{G|" # start of a call tag
+tag_end = "|G}}" # end of a call tag
+_call_pool = {} # the functions associated with the call tags
+_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end)) # the pattern for matching call tags
 
 class StatefulException(Exception):
     '''This is raised when we try and use the state of a grammar object like it was a live model.
 
-    Note that eventually we do want to support stateful parser/grammar constructs directly, but
-    for now we use a traditional parser and grammar separation (hence the need for this exception).'''
+    Note that eventually it would be nice to support stateful parser/grammar constructs directly, but
+    such "parser combinators" cannot be run effciently in Python. So we use a traditional parser and
+    grammar separation (hence the need for this exception).'''
     pass
 
 class Function():
+    ''' This is the abstract class representing all guidance functions.
+
+    There are two main subclasses: GrammarFunction and RawFunction. GrammarFunctions
+    represent guidance grammars that can be serialized and sent across the wire, while
+    RawFunctions represent unconstrained native Python functions.
+    '''
+
     def __init__(self, name, value=None) -> None:
         self.name = name
         self.value = value
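
For context on this hunk: the delimiters let a guidance function be embedded inside a Python f-string as a plain text tag and recovered later by regex. A minimal sketch of that round trip is below; the register() helper, the demo greet() function, and the use of id() as the pool key are illustrative assumptions, not guidance's actual API.

import re

# Delimiters and pattern copied from the hunk above.
tag_start = "{{G|"
tag_end = "|G}}"
_call_pool = {}
_tag_pattern = re.compile(re.escape(tag_start) + r"([^\|]+)" + re.escape(tag_end))

def register(fn):
    # Hypothetical helper: stash the function under a key and return the
    # tag text that stands in for it inside an f-string.
    key = str(id(fn))
    _call_pool[key] = fn
    return tag_start + key + tag_end

def greet():
    return "hello"

template = f"prefix {register(greet)} suffix"

# The pattern recovers each key, which indexes back into _call_pool.
for match in _tag_pattern.finditer(template):
    print(_call_pool[match.group(1)]())  # prints: hello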

guidance/models/transformers/_transformers.py

Lines changed: 2 additions & 1 deletion
@@ -93,7 +93,8 @@ def _model_and_tokenizer(self, model, tokenizer, **kwargs):
         return model, tokenizer
 
     def _joint_tokenize(self, token_ids):
-        first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
+        # first_decode = self.tokenizer._orig_tokenizer.decode(token_ids)
+        first_decode = b''.join([self.tokenizer.tokens[id] for id in token_ids]).decode("utf8")
         new_ids = self.tokenizer._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]
 
         # HACK: check for a bug in the HuggingFace tokenizer (that will just add extra spaces during an encode-decode cycle)
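
On this hunk: the HuggingFace decode() can alter whitespace during an encode-decode cycle (as the HACK comment above notes), so re-encoding its output is not guaranteed to reproduce the original token ids. Joining the raw per-token byte strings keeps the reconstruction byte-exact. A toy sketch of the difference, assuming a made-up ToyTokenizer whose tokens table plays the role of guidance's per-token byte strings (not the library's real classes):

class ToyTokenizer:
    """Toy stand-in for a tokenizer wrapper that keeps the raw byte string
    of every token, so text can be rebuilt exactly from token ids."""

    def __init__(self):
        # id -> raw token bytes (illustrative vocabulary)
        self.tokens = {0: b"Hello", 1: b" ", 2: b" world", 3: b"!"}

    def lossy_decode(self, ids):
        # Mimics a decode() that normalizes whitespace: the standalone
        # space token is dropped, so re-encoding would yield different ids.
        return "".join(self.tokens[i].decode("utf8") for i in ids if self.tokens[i] != b" ")

    def exact_decode(self, ids):
        # The approach in the commit: concatenate the raw token bytes first,
        # then decode once, so nothing gets normalized away.
        return b"".join(self.tokens[i] for i in ids).decode("utf8")

tok = ToyTokenizer()
ids = [0, 1, 2, 3]
print(repr(tok.lossy_decode(ids)))  # 'Hello world!'  -- the extra space is lost
print(repr(tok.exact_decode(ids)))  # 'Hello  world!' -- byte-exact reconstruction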
