Skip to content

Commit 3968c5a

Browse files
committed
Update regex per ngxson
1 parent 1837951 commit 3968c5a

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

src/llama-vocab.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
396396
// original regex from tokenizer.json
397397
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
398398
regex_exprs = {
399-
"[^\\r\\nA-Za-z0-9]?[A-Z]*[a-z]+(?:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\nA-Za-z0-9]?[A-Z]+[a-z]*(?:'s|'t|'re|'ve|'m|'ll|'d)?|[0-9]{1,3}| ?[^\\sA-Za-z0-9]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
399+
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
400400
};
401401
break;
402402
default:

0 commit comments

Comments
 (0)