@@ -284,6 +284,27 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
284284"""BERT tokenizer based on WordPiece"""
285285
286286
# Python mirror of the C enum `llama_vocab_pre_type` quoted below.
# Each module-level constant carries the same integer value as the enum
# member of the same name, so the two lists must be kept in sync.
#
# // pre-tokenization types
# enum llama_vocab_pre_type {
#     LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
#     LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
#     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
#     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
#     LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
#     LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
#     LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
#     LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3
LLAMA_VOCAB_PRE_TYPE_FALCON = 4
LLAMA_VOCAB_PRE_TYPE_MPT = 5
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
306+
307+
287308# // note: these values should be synchronized with ggml_rope
288309# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
289310# enum llama_rope_type {
0 commit comments