
Commit 3ef0128

refactor(tokenizer): use the bpe tokenizer from its standalone repository
Signed-off-by: YdrMaster <[email protected]>
1 parent: 9f1384f

File tree

7 files changed, +37 -678 lines


Cargo.lock

Lines changed: 12 additions & 3 deletions
Some generated files are not rendered by default.

service/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ causal-lm = { path = "../causal-lm" }
 chat-template = { path = "../chat-template" }
 log.workspace = true
 tokio.workspace = true
+memmap2.workspace = true
 lru = "0.12"
 rangemap = "1.5"
 

service/src/lib.rs

Lines changed: 8 additions & 7 deletions
@@ -8,10 +8,11 @@ use chat_template::ChatTemplate;
 use session::{Dispatcher, Generator};
 use std::{
     fmt::{self, Debug},
+    fs::File,
     path::Path,
     sync::Arc,
 };
-use tokenizer::{BPECommonNormalizer, Normalizer, Tokenize, Tokenizer, VocabTxt, BPE};
+use tokenizer::{BPECommonNormalizer, Bpe, Normalizer, Tokeneer, Tokenize, VocabTxt};
 use tokio::task::JoinHandle;
 
 pub use chat_template::Message;
@@ -152,10 +153,8 @@ fn template(model_dir: impl AsRef<Path>) -> ChatTemplate {
 
 fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync> {
     use std::io::ErrorKind::NotFound;
-    match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
-        Ok(_) => return Box::new(BPECommonNormalizer {}),
-        Err(e) if e.kind() == NotFound => {}
-        Err(e) => panic!("{e:?}"),
+    if model_dir.as_ref().join("tokenizer.model").is_file() {
+        return Box::new(BPECommonNormalizer {});
     }
     match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
         Ok(_) => return Box::new(()),
@@ -167,8 +166,10 @@ fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync>
 
 fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenize + Send + Sync> {
     use std::io::ErrorKind::NotFound;
-    match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
-        Ok(bpe) => return Box::new(Tokenizer::new(bpe)),
+    let file = File::open(model_dir.as_ref().join("tokenizer.model"))
+        .and_then(|f| unsafe { memmap2::Mmap::map(&f) });
+    match file {
+        Ok(f) => return Box::new(Tokeneer::new(Bpe::from_tokenizer_model(&f))),
         Err(e) if e.kind() == NotFound => {}
         Err(e) => panic!("{e:?}"),
     }
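
For reference, a minimal sketch of the loading pattern the new tokenizer() body switches to: tokenizer.model is memory-mapped with memmap2 and the raw bytes are handed to the Bpe/Tokeneer types re-exported by the tokenizer crate. The helper name load_bpe_tokenizer and the concrete return type Tokeneer<Bpe> are assumptions for illustration, not code from this commit.

use std::{fs::File, io, path::Path};

use tokenizer::{Bpe, Tokeneer};

// Hypothetical helper mirroring the new `tokenizer()` above; the concrete
// return type `Tokeneer<Bpe>` is assumed here.
fn load_bpe_tokenizer(model_dir: &Path) -> io::Result<Tokeneer<Bpe>> {
    let file = File::open(model_dir.join("tokenizer.model"))?;
    // Map the model file instead of reading it into a buffer; `Mmap` derefs
    // to `[u8]`, so the mapped bytes can be handed straight to the parser.
    let bytes = unsafe { memmap2::Mmap::map(&file)? };
    let bpe = Bpe::from_tokenizer_model(&bytes);
    Ok(Tokeneer::new(bpe))
}

In the commit itself this logic stays inline in service/src/lib.rs, with the NotFound case falling through to the lookup below the hunk.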

tokenizer/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -9,4 +9,4 @@ authors = ["YdrMaster <[email protected]>"]
 [dependencies]
 memmap2.workspace = true
 patricia_tree = "0.8"
-regex = "1.10"
+tokeneer = { git = "https://github.com/YdrMaster/tokeneer", rev = "8a9c60d" }
