Skip to content

Commit f079e6f

Browse files
committed
Fix #555 (patch huggingface bug)
1 parent 6b0cabd commit f079e6f

File tree

1 file changed

+9
-1
lines changed

1 file changed

+9
-1
lines changed

guidance/models/transformers/_transformers.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,15 @@ def __init__(self, model=None, tokenizer=None, echo=True, caching=True, temperat
7171
self._cache_state["cache_token_ids"] = []
7272

7373
def _joint_tokenize(self, token_ids):
74-
return self._orig_tokenizer(self._orig_tokenizer.decode(token_ids), add_special_tokens=False)["input_ids"]
74+
first_decode = self._orig_tokenizer.decode(token_ids)
75+
new_ids = self._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]
76+
77+
# HACK: check for a bug in the HuggingFace tokenizer (that will just add extra spaces during an encode-decode cycle)
78+
second_decode = self._orig_tokenizer.decode(new_ids)
79+
if second_decode != first_decode and len(second_decode) == len(first_decode) + 1 and second_decode.startswith("<s> "):
80+
new_ids = new_ids[0:1] + new_ids[2:]
81+
82+
return new_ids
7583

7684
def _model_and_tokenizer(self, model, tokenizer, **kwargs):
7785

0 commit comments

Comments
 (0)