
Commit a7b9512

larryliu0820 authored and facebook-github-bot committed
Add sentencepiece tokenizer support to llm runner (#11645)
Summary: X-link: meta-pytorch/tokenizers#85
Fixes #11618
Add sentencepiece tokenizer support.
Pull Request resolved: #11645
Reviewed By: guangy10
Differential Revision: D76789606
Pulled By: larryliu0820
1 parent a6d8440 commit a7b9512

File tree: 3 files changed (+9, -2 lines)


extension/llm/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ def define_common_targets():
             ":text_token_generator" + aten_suffix,
             "//pytorch/tokenizers:hf_tokenizer",
             "//pytorch/tokenizers:llama2c_tokenizer",
-            # "//pytorch/tokenizers:sentencepiece", # TODO(larryliu0820) Make sure this compiles in xplat.
+            "//pytorch/tokenizers:sentencepiece",
             "//pytorch/tokenizers:tiktoken",
         ],
     )

extension/llm/runner/text_llm_runner.cpp

Lines changed: 7 additions & 0 deletions
@@ -14,6 +14,7 @@
 #include <executorch/extension/llm/runner/util.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/tiktoken.h>

 namespace executorch::extension::llm {
@@ -278,6 +279,12 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     return tiktoken_tokenizer;
   }

+  auto sp_tokenizer = std::make_unique<::tokenizers::SPTokenizer>();
+  if (sp_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded Sentencepiece tokenizer");
+    return sp_tokenizer;
+  }
+
   auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>();
   if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
     ET_LOG(Info, "Loaded BPE tokenizer");
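For context on the hunk above: load_tokenizer probes each supported tokenizer format in order and returns the first one whose load() call succeeds, so SentencePiece is now tried after Tiktoken but before the Llama2.c BPE fallback. The sketch below isolates that try-in-order pattern for just the two tokenizers constructed in this hunk; the free function name try_sentencepiece_then_bpe is hypothetical, and it assumes the pytorch/tokenizers headers shown in the includes above are on the include path.

// Minimal sketch of the fallback pattern used in load_tokenizer, limited to
// the two tokenizers constructed in the hunk above. The function name is
// hypothetical; SPTokenizer, Llama2cTokenizer, and the load()/Error::Ok
// contract are taken from the diff itself.
#include <memory>
#include <string>

#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/sentencepiece.h>

std::unique_ptr<::tokenizers::Tokenizer> try_sentencepiece_then_bpe(
    const std::string& tokenizer_path) {
  // Try SentencePiece first so a SentencePiece model file is recognized
  // before falling through to the Llama2.c BPE loader.
  auto sp_tokenizer = std::make_unique<::tokenizers::SPTokenizer>();
  if (sp_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
    return sp_tokenizer;
  }
  // Fall back to the Llama2.c BPE tokenizer format.
  auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>();
  if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
    return bpe_tokenizer;
  }
  return nullptr;  // Neither format could parse the file.
}

The ordering only matters in that the first loader to accept the file wins; placing SentencePiece ahead of the BPE fallback lets the runner pick the right parser without a separate format flag.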

extension/llm/tokenizers (submodule pointer update; content diff not shown)
