
Commit 88b3394

Add hugging face tokenizer

1 parent 8cbe6cd

2 files changed: +25 −8 lines


examples/models/llama/runner/generation.py

Lines changed: 11 additions & 5 deletions
@@ -102,7 +102,8 @@ def generate(  # noqa: C901
         )

         current_token = next_token(logits, temperature, top_p)
-        print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+        # print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+        print(f"{self.tokenizer.decode([current_token])}", end="", flush=True)
         tokens = prompt_tokens + [current_token]

         while len(tokens) < max_seq_len:
@@ -132,7 +133,8 @@ def generate(  # noqa: C901
             ):
                 break

-            print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+            # print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+            print(f"{self.tokenizer.decode([current_token])}", end="", flush=True)
         print("\n")

         return tokens if echo else tokens[len(prompt_tokens) :]
@@ -160,7 +162,8 @@ def text_completion(
         This method generates text completion for the provided prompt, employing nucleus sampling to introduce controlled randomness.
         """
         return self.generate(
-            prompt_tokens=self.tokenizer.encode(prompt, bos=True, eos=False),
+            # prompt_tokens=self.tokenizer.encode(prompt, bos=True, eos=False),
+            prompt_tokens=self.tokenizer.encode(prompt).ids,
             max_seq_len=self.max_seq_len,
             temperature=temperature,
             top_p=top_p,
@@ -194,9 +197,12 @@ def chat_completion(
         prompt = input("Me: ")
         while prompt and prompt != exit_prompt:
             print("LLM: ", end="", flush=True)
+            # prompt_tokens = self.tokenizer.encode(
+            #     self._format_prompt(prompt), bos=True, eos=False
+            # )
            prompt_tokens = self.tokenizer.encode(
-                self._format_prompt(prompt), bos=True, eos=False
-            )
+                self._format_prompt(prompt)
+            ).ids
             generated_tokens = self.generate(
                 prompt_tokens=pre_stop_token + prompt_tokens,
                 max_seq_len=max_seq_len,
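
The pattern in all four hunks is the same: the runner's custom tokenizer interface (decode_token(), encode(text, bos=..., eos=...)) is swapped for the Hugging Face tokenizers API, where encode() returns an Encoding object whose .ids attribute holds the token IDs, and decode() takes a list of IDs. A minimal standalone sketch of the new calls, assuming a local tokenizer.json; the file name and sample text are placeholders:

    from tokenizers import Tokenizer

    # Load a tokenizer serialized in Hugging Face's tokenizer.json format
    tokenizer = Tokenizer.from_file("tokenizer.json")

    # encode() returns an Encoding; .ids is the plain list of token IDs
    prompt_tokens = tokenizer.encode("Hello, world").ids

    # decode() expects a list, so a single token is wrapped in a one-element list
    for current_token in prompt_tokens:
        print(tokenizer.decode([current_token]), end="", flush=True)

One behavioral difference worth noting: the old encode(prompt, bos=True, eos=False) call added a BOS token explicitly, while Tokenizer.encode() only adds special tokens if the post-processor stored in tokenizer.json does so.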

extension/llm/tokenizer/utils.py

Lines changed: 14 additions & 3 deletions
@@ -13,9 +13,20 @@

 def get_tokenizer(tokenizer_path):
     if tokenizer_path.endswith(".json"):
-        print("Using Hugging Face tokenizer")
-        tokenizer = HFTokenizer()
-        tokenizer.load(tokenizer_path)
+        # print("Using Hugging Face tokenizer")
+        # tokenizer = HFTokenizer()
+        # tokenizer.load(tokenizer_path)
+
+        from tokenizers import Tokenizer
+
+        # Load the tokenizer from the tokenizer.json file
+        tokenizer = Tokenizer.from_file(tokenizer_path)
+
+        # from tokenizers import SentencePieceBPETokenizer
+
+        # tokenizer = SentencePieceBPETokenizer(tokenizer_path)
+        tokenizer.n_words = tokenizer.get_vocab_size()
+        breakpoint()
     else:
         try:
             tokenizer = SentencePieceTokenizer(model_path=str(tokenizer_path))
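
After this change, get_tokenizer() dispatches on the file extension: a path ending in .json is loaded with tokenizers.Tokenizer.from_file() and patched with an n_words attribute so it mirrors the SentencePiece tokenizer's vocab-size field, while any other path falls back to SentencePieceTokenizer. A hedged usage sketch — the import path assumes the repo's executorch package layout and the tokenizer path is a placeholder; note that the breakpoint() left in the diff will drop the .json path into the debugger:

    from executorch.extension.llm.tokenizer.utils import get_tokenizer

    # .json -> Hugging Face tokenizers.Tokenizer; otherwise SentencePiece
    tokenizer = get_tokenizer("/path/to/tokenizer.json")

    # n_words is attached manually to match SentencePieceTokenizer's interface
    assert tokenizer.n_words == tokenizer.get_vocab_size()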
