Skip to content

Commit 1a1e448

Browse files
author
Marcin Kardas
committed
Disable cleaning of spaces
Now the outputs match the ones before AutoTokenizer was introduced.
1 parent 7ee7c97 commit 1a1e448

File tree

2 files changed

+11
-3
lines changed

2 files changed

+11
-3
lines changed

galai/model.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,11 @@ def generate(
306306
if not return_full_text:
307307
out_tokens = out_tokens[:, input_v.shape[1]:]
308308
# we keep special tokens such as [START_REF] or <work>
309-
decoded = self.tokenizer.batch_decode(out_tokens, skip_special_tokens=False)
309+
decoded = self.tokenizer.batch_decode(
310+
out_tokens,
311+
skip_special_tokens=False,
312+
clean_up_tokenization_spaces=False,
313+
)
310314
# so we manually remove </s> and <pad>
311315
decoded = [
312316
text.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
@@ -431,7 +435,11 @@ def generate_reference(
431435
)
432436
# cut-off the prompts
433437
generated_tokens = out["sequences"][:, prompt_length:]
434-
decoded = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
438+
decoded = self.tokenizer.batch_decode(
439+
generated_tokens,
440+
skip_special_tokens=False,
441+
clean_up_tokenization_spaces=False,
442+
)
435443
references = []
436444
unfinished_generation = False
437445
for text in decoded:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from setuptools import setup, find_packages
22

33
PACKAGE_NAME = 'galai'
4-
VERSION = "1.1.5"
4+
VERSION = "1.1.6"
55
DESCRIPTION = "API for the GALACTICA model"
66
KEYWORDS = "Scientific Intelligence"
77
URL = 'https://github.com/paperswithcode/galai'

0 commit comments

Comments (0)