Skip to content

Commit db54ea5

Browse files
ryan-mangeno and CISC authored
Update src/llama-vocab.cpp
Co-authored-by: Sigbjørn Skjæret <[email protected]>
1 parent 859005d commit db54ea5

File tree

1 file changed

+0
-8
lines changed

1 file changed

+0
-8
lines changed

src/llama-vocab.cpp

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -425,14 +425,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
425425
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
426426
};
427427
break;
428-
case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING:
429-
// uses digits and byte level pre tokenizers defined in the pre_tokenizer section of
430-
// https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json
431-
regex_exprs = {
432-
"[0-9]",
433-
"[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+",
434-
};
435-
break;
436428
default:
437429
// default regex for BPE tokenization pre-processing
438430
regex_exprs = {

0 commit comments

Comments (0)