Commit 9f1384f

fix(tokenizer): match special sequences first
Signed-off-by: YdrMaster <[email protected]>
1 parent f688768 commit 9f1384f

File tree

7 files changed: +168 −20 lines changed


Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

service/src/lib.rs

Lines changed: 4 additions & 4 deletions
@@ -11,7 +11,7 @@ use std::{
     path::Path,
     sync::Arc,
 };
-use tokenizer::{BPECommonNormalizer, Normalizer, Tokenizer, VocabTxt, BPE};
+use tokenizer::{BPECommonNormalizer, Normalizer, Tokenize, Tokenizer, VocabTxt, BPE};
 use tokio::task::JoinHandle;
 
 pub use chat_template::Message;
@@ -29,7 +29,7 @@ pub struct Service<M: CausalLM> {
 /// The inference thread's lifecycle is bound to this component.
 struct ServiceComponent<M: CausalLM> {
     handle: Arc<Dispatcher<M>>,
-    tokenizer: Box<dyn Tokenizer + Send + Sync>,
+    tokenizer: Box<dyn Tokenize + Send + Sync>,
     normalizer: Box<dyn Normalizer + Send + Sync>,
     template: ChatTemplate,
     bos: String,
@@ -165,10 +165,10 @@ fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync>
     panic!("Tokenizer file not found");
 }
 
-fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenizer + Send + Sync> {
+fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenize + Send + Sync> {
     use std::io::ErrorKind::NotFound;
     match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
-        Ok(bpe) => return Box::new(bpe),
+        Ok(bpe) => return Box::new(Tokenizer::new(bpe)),
         Err(e) if e.kind() == NotFound => {}
         Err(e) => panic!("{e:?}"),
     }
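
The service now stores its tokenizer behind the object-safe `Tokenize` trait and wraps the BPE back end in the new special-sequence-aware `Tokenizer`. A minimal sketch of that wiring, assuming a `tokenizer.model` next to the model files (the `load` helper name is invented for illustration):

```rust
use std::path::Path;
use tokenizer::{Tokenize, Tokenizer, BPE};

// Hypothetical helper mirroring the service's `tokenizer()` function: the BPE
// back end is wrapped by `Tokenizer::new`, and the wrapper is what gets boxed
// behind the object-safe `Tokenize` trait the service keeps using.
fn load(model_dir: &Path) -> Box<dyn Tokenize + Send + Sync> {
    let bpe = BPE::from_tokenizer_model(model_dir.join("tokenizer.model"))
        .expect("tokenizer.model not found or malformed");
    Box::new(Tokenizer::new(bpe))
}
```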

tokenizer/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ authors = ["YdrMaster <[email protected]>"]
 [dependencies]
 memmap2.workspace = true
 patricia_tree = "0.8"
+regex = "1.10"

tokenizer/src/bpe.rs

Lines changed: 52 additions & 13 deletions
@@ -1,4 +1,4 @@
-use crate::{as_byte_token, utok, Tokenizer};
+use crate::{as_byte_token, utok, Method};
 use std::{
     collections::{HashMap, HashSet},
     io,
@@ -161,7 +161,7 @@ impl BPE {
             .iter()
             .filter_map(|&t| {
                 let s = unsafe { std::str::from_utf8_unchecked(self.token(t)) };
-                if self.encode(s).len() > 1 {
+                if self.encode(s).into_iter().nth(1).is_some() {
                     Some((s, t))
                 } else {
                     None
@@ -192,22 +192,28 @@ impl BPE {
     }
 }
 
-impl Tokenizer for BPE {
+impl Method for BPE {
+    #[inline]
+    fn unk_token(&self) -> utok {
+        self.unk
+    }
     #[inline]
     fn vocab_size(&self) -> usize {
         self.tokens.len()
     }
-
     #[inline]
-    fn encode(&self, text: &str) -> Vec<utok> {
+    fn internal_special(&self) -> impl IntoIterator<Item = (&str, utok)> {
+        self.inaccessible()
+    }
+    #[inline]
+    fn encode<'a>(&'a self, text: &'a str) -> impl IntoIterator<Item = utok> + 'a {
         let mut tokenizer = self.build_tokenizer(text);
         while tokenizer.merge() {}
-        tokenizer.iter().collect()
+        tokenizer
     }
-
     #[inline]
-    fn decode(&self, token: utok) -> &str {
-        unsafe { std::str::from_utf8_unchecked(self.token(token)) }
+    fn decode(&self, token: utok) -> &[u8] {
+        self.token(token)
     }
 }
 
@@ -267,9 +273,15 @@ mod algorithm {
         merges: BinaryHeap<Merge>,
     }
 
+    pub struct IntoIter<'a> {
+        bpe: &'a BPE,
+        marks: Vec<Mark>,
+        i: usize,
+    }
+
     pub struct Iter<'a> {
         bpe: &'a BPE,
-        slice: &'a [Mark],
+        marks: &'a [Mark],
     }
 
     impl BPE {
@@ -450,7 +462,34 @@ mod algorithm {
         pub fn iter(&self) -> Iter {
             Iter {
                 bpe: self.bpe,
-                slice: &self.marks,
+                marks: &self.marks,
+            }
+        }
+    }
+
+    impl<'a> IntoIterator for BpeTokenizer<'a> {
+        type Item = utok;
+        type IntoIter = IntoIter<'a>;
+        #[inline]
+        fn into_iter(self) -> Self::IntoIter {
+            Self::IntoIter {
+                bpe: self.bpe,
+                marks: self.marks,
+                i: 0,
+            }
+        }
+    }
+
+    impl Iterator for IntoIter<'_> {
+        type Item = utok;
+
+        fn next(&mut self) -> Option<Self::Item> {
+            match &self.marks[self.i..] {
+                &[Mark { token, .. }, ..] => {
+                    self.i += self.bpe.token(token).len();
+                    Some(token)
+                }
+                [] => None,
             }
         }
     }
@@ -459,9 +498,9 @@ mod algorithm {
         type Item = utok;
 
         fn next(&mut self) -> Option<Self::Item> {
-            match self.slice {
+            match self.marks {
                 &[Mark { token, .. }, ref tail @ ..] => {
-                    self.slice = &tail[self.bpe.token(token).len() - 1..];
+                    self.marks = &tail[self.bpe.token(token).len() - 1..];
                     Some(token)
                 }
                 [] => None,
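
With the switch to `Method`, `BPE::encode` now yields tokens lazily through the consuming `IntoIter`, and `decode` hands back the token's raw bytes instead of a `&str`. A small sketch of how that surface is consumed (the helper names are illustrative, not from the repo):

```rust
use tokenizer::{utok, Method, BPE};

// `Method::encode` returns `impl IntoIterator<Item = utok>`, so callers decide
// whether to stream the ids or collect them into a Vec as before.
fn collect_ids(bpe: &BPE, text: &str) -> Vec<utok> {
    bpe.encode(text).into_iter().collect()
}

// `Method::decode` now yields raw bytes; turning them back into a string is
// left to a higher layer (the special-sequence wrapper does this).
fn token_bytes(bpe: &BPE, token: utok) -> Vec<u8> {
    bpe.decode(token).to_vec()
}
```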

tokenizer/src/lib.rs

Lines changed: 12 additions & 1 deletion
@@ -2,27 +2,38 @@
 
 mod bpe;
 mod normalizer;
+mod special;
 mod vocab_txt;
 
 /// `utok` for token id.
 #[allow(non_camel_case_types)]
 pub type utok = u32;
 
-pub trait Tokenizer {
+pub trait Tokenize {
     fn vocab_size(&self) -> usize;
     fn encode(&self, text: &str) -> Vec<utok>;
     fn decode(&self, token: utok) -> &str;
 }
 
+pub trait Method {
+    fn unk_token(&self) -> utok;
+    fn vocab_size(&self) -> usize;
+    fn internal_special(&self) -> impl IntoIterator<Item = (&str, utok)>;
+    fn encode<'a>(&'a self, text: &'a str) -> impl IntoIterator<Item = utok> + 'a;
+    fn decode(&self, token: utok) -> &[u8];
+}
+
 pub use bpe::BPE;
 pub use normalizer::{BPECommonNormalizer, Normalizer};
+pub use special::Tokenizer;
 pub use vocab_txt::VocabTxt;
 
 const fn as_byte_token(piece: &[u8]) -> Option<u8> {
     // destructure and convert
     match piece {
         &[b'<', b'0', b'x', a, b, b'>'] if a.is_ascii_hexdigit() && b.is_ascii_hexdigit() => {
             // convert an ASCII hex digit to its numeric value
+            #[inline(always)]
             const fn to_num(c: u8) -> u8 {
                 match c {
                     b'0'..=b'9' => c - b'0',
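
`Tokenize` stays object-safe for the service, while the new `Method` trait describes a concrete tokenization back end with the hooks the special-sequence wrapper needs. A hypothetical toy implementation (not in the repo) showing what the trait asks for:

```rust
use tokenizer::{utok, Method};

// Hypothetical byte-level back end: every byte is its own token id.
struct ByteLevel;

impl Method for ByteLevel {
    fn unk_token(&self) -> utok {
        0
    }
    fn vocab_size(&self) -> usize {
        256
    }
    // no built-in special sequences for this toy back end
    fn internal_special(&self) -> impl IntoIterator<Item = (&str, utok)> {
        std::iter::empty()
    }
    // encode is lazy: an iterator, not a Vec
    fn encode<'a>(&'a self, text: &'a str) -> impl IntoIterator<Item = utok> + 'a {
        text.bytes().map(|b| b as utok)
    }
    // a real back end returns the token's byte sequence from its vocabulary;
    // this stub has nothing to borrow from, so it returns an empty slice
    fn decode(&self, _token: utok) -> &[u8] {
        &[]
    }
}
```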

tokenizer/src/special.rs

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+use crate::{utok, Method};
+use regex::Regex;
+use std::collections::HashMap;
+
+pub struct Tokenizer<M> {
+    method: M,
+    special: HashMap<String, Vec<utok>>,
+    special_regex: regex::Regex,
+}
+
+impl<M: Method> Tokenizer<M> {
+    pub fn new(method: M) -> Self {
+        let special = method
+            .internal_special()
+            .into_iter()
+            .map(|(k, v)| (k.to_string(), vec![v]))
+            .collect::<HashMap<_, _>>();
+        let special_regex = build_pattern(special.keys());
+        Self {
+            method,
+            special,
+            special_regex,
+        }
+    }
+
+    pub fn extend_special(&mut self, patterns: impl IntoIterator<Item = (String, Vec<utok>)>) {
+        use std::collections::hash_map::Entry::{Occupied, Vacant};
+        let mut any = false;
+        for (k, v) in patterns {
+            match self.special.entry(k) {
+                Occupied(entry) => {
+                    assert_eq!(entry.get(), &v);
+                }
+                Vacant(entry) => {
+                    entry.insert(v);
+                    any = true;
+                }
+            }
+        }
+        if any {
+            self.special_regex = build_pattern(self.special.keys());
+        }
+    }
+
+    pub fn encode(&self, text: &str) -> Vec<utok> {
+        let mut ans = Vec::new();
+        let mut start = 0;
+        for m in self.special_regex.find_iter(text) {
+            ans.extend(self.method.encode(&text[start..m.start()]));
+            ans.extend_from_slice(&self.special[m.as_str()]);
+            start = m.end();
+        }
+        ans.extend(self.method.encode(&text[start..]));
+        ans
+    }
+
+    pub fn decode(&self, tokens: &[utok]) -> String {
+        let mut ans = Vec::new();
+        for &t in tokens {
+            ans.extend_from_slice(self.method.decode(t));
+        }
+        String::from_utf8(ans).unwrap()
+    }
+
+    pub fn internal(&self) -> &M {
+        &self.method
+    }
+}
+
+fn build_pattern<'a, T: AsRef<str>>(text: impl IntoIterator<Item = T>) -> Regex {
+    let mut pattern = String::new();
+    let mut iter = text.into_iter();
+    if let Some(p) = iter.next() {
+        pattern.push_str(p.as_ref());
+    }
+    for p in iter {
+        pattern.push('|');
+        pattern.push_str(p.as_ref());
+    }
+    regex::Regex::new(&pattern).unwrap()
+}
+
+impl crate::Tokenize for Tokenizer<crate::BPE> {
+    #[inline]
+    fn vocab_size(&self) -> usize {
+        self.method.vocab_size()
+    }
+    #[inline]
+    fn encode(&self, text: &str) -> Vec<utok> {
+        self.encode(text)
+    }
+    #[inline]
+    fn decode(&self, token: utok) -> &str {
+        unsafe { std::str::from_utf8_unchecked(self.method.decode(token)) }
+    }
+}
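
This wrapper is what implements the commit's "match special sequences first" behavior: the combined regex scans the input before the inner method runs, so registered sequences map straight to their reserved ids instead of being split by BPE merges. A usage sketch (the special string and id below are invented, not from the repo):

```rust
use tokenizer::{Tokenizer, BPE};

fn demo(bpe: BPE) {
    let mut tokenizer = Tokenizer::new(bpe);
    // hypothetical special sequence and id, added on top of the internal ones
    tokenizer.extend_special([("<eos>".to_string(), vec![2])]);

    // "<eos>" is matched first and contributes exactly the registered id;
    // the surrounding text is encoded by the wrapped BPE method.
    let ids = tokenizer.encode("hello <eos> world");
    assert!(ids.contains(&2));
}
```

Note that `build_pattern` joins the raw special strings with `|` without escaping, so this sketch assumes special sequences free of regex metacharacters.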

tokenizer/src/vocab_txt.rs

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-use crate::{decode_with_ascii, utok, Tokenizer};
+use crate::{decode_with_ascii, utok, Tokenize};
 use memmap2::Mmap;
 use patricia_tree::PatriciaMap;
 use std::{fs::File, io::Result, path::Path};
@@ -28,7 +28,7 @@ impl VocabTxt {
     }
 }
 
-impl Tokenizer for VocabTxt {
+impl Tokenize for VocabTxt {
     fn vocab_size(&self) -> usize {
         self.words.len()
     }

0 commit comments