Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added models/ggml-vocab-ling-plus.gguf
Binary file not shown.
112 changes: 112 additions & 0 deletions models/ggml-vocab-ling-plus.gguf.inp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__


__ggml_vocab_test__



__ggml_vocab_test__




__ggml_vocab_test__


__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__

=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
__ggml_vocab_test__
!!!!!!
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
Cửa Việt
__ggml_vocab_test__
discards
__ggml_vocab_test__











🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__
46 changes: 46 additions & 0 deletions models/ggml-vocab-ling-plus.gguf.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
1368 220 19 220 24452 3710
37 78549 21767

220
256
305
197
198
198 198
198 198 198
197 198
14455 1931
34994 1931
14455 4832
34994 4832
34994 4832 0
14455 11 1931 0
34994 11 1931 0
501 341 8811 99 247 13 37710
86 15 19 23 220 22 83 5322 43067 27647 5599 18572
107171 31758 3186 23679 109404 31994 8108 23947 31216 26804 3875
34387 222 34387 114 34387 241 80203 233 34387 237 80203 224 34387 244 34387 115 34387 253 80203 223 34387 253 34387 95 34387 114 34387 227 34387 223 34387 249 34387 227 80203 223 34387 231
114055 222 363 12415 8 21803 114 56848 75978 104 25661 363 64398 1098 115815 53600 659 8 44358 227 363 7619 86273 378 723 1097 1645 9775 8
14455
34994
220 34994
256 34994
305 34994
305 34994 198 305 34994
363
198 373
6 16206
14455 11 340 88386 0 2071 449 362 21803 223 3543 17175 401 32164 1557 16 18 16 19 16 20 16 820 7506
15421 4021
18
18 18
18 18 18
18 18 18 18
18 18 18 18 18
18 18 18 18 18 18
18 18 18 18 18 18 18
18 18 18 18 18 18 18 18
18 18 18 18 18 18 18 18 18
34 17136 255 64 34335 82161 83
2133 3082
198 220 198 198 220 198 198 198 220 197 220 456 220 197 198 256 198 305 198 263 198 798 198 114055 222 363 12415 8 21803 114 56848 75978 104 25661 363 64398 1098 115815 53600 659 8 44358 227 8811 99 247 5901 99 247 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 491 18 220 18 1152 18 220 34387 222 34387 114 34387 241 80203 233 34387 237 80203 224 34387 244 34387 115 34387 253 80203 223 34387 253 34387 95 34387 114 34387 227 44542 223 3543 17175 401 32164 1557 16 18 16 19 16 20 16 820 7506 93668 1315 25898 41979 31758 3186 23679 109404 31994 8108 23947 31216 26804 3875 36809 54629 44470 2891 7980 7980 13252 15421 4021 34600 10303 331 3868 1007 689 119947 464 698 947 11 689 1364 362 2607 30 689 44 526 2607 331 4164 1253 403 11 689 35 362 1029 971 13197 30 1104 6 54377 259 82191 43
3 changes: 2 additions & 1 deletion src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
regex_exprs = {
// original regex from tokenizer.json
// "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
// FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
default:
Expand Down
Loading