We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 859005d, commit db54ea5 — Copy full SHA for db54ea5
src/llama-vocab.cpp
@@ -425,14 +425,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
425
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
426
};
427
break;
428
- case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING:
429
- // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of
430
- // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json
431
- regex_exprs = {
432
- "[0-9]",
433
- "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+",
434
- };
435
- break;
436
default:
437
// default regex for BPE tokenization pre-processing
438
regex_exprs = {
0 commit comments