
Commit 19ace0b

GuyGoldenberg authored and ggerganov committed

Merge commit from fork

* vocab : prevent integer overflow during load

* Add static cast and GGML_ABORT

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1468275 commit 19ace0b
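The message names two ingredients: overflow checks while loading the vocab, plus a static_cast guarded by GGML_ABORT. A minimal sketch of that guard pattern, under stated assumptions (the helper name is invented, and the macro below is a compilable stand-in for ggml's real GGML_ABORT):

```cpp
#include <climits>
#include <cstdint>
#include <cstdlib>

// Stand-in so this sketch compiles alone; the real GGML_ABORT in ggml
// logs the failing file/line before terminating.
#define GGML_ABORT(msg) do { (void)(msg); abort(); } while (0)

// Hypothetical helper: narrow a 64-bit count read from a model file to
// int32_t, aborting on overflow instead of silently wrapping.
static int32_t checked_i32(uint64_t n) {
    if (n > (uint64_t) INT32_MAX) {
        GGML_ABORT("value exceeds int32_t range");
    }
    return static_cast<int32_t>(n);
}
```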

File tree

1 file changed (+8, -31)

src/llama-vocab.cpp

Lines changed: 8 additions & 31 deletions
@@ -9,16 +9,17 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cctype>
 #include <cfloat>
+#include <climits>
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
-#include <limits>
 #include <map>
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>
+#include <cinttypes>
 
 //
 // helpers
@@ -1269,7 +1270,6 @@ struct llama_vocab::impl {
     bool add_space_prefix = false;
     bool add_bos = false;
     bool add_eos = false;
-    bool add_sep = false;
     bool ignore_merges = false;
     bool clean_spaces = false; // clean_up_tokenization_spaces
     bool remove_extra_whitespaces = false;
@@ -1422,8 +1422,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_sep_id = 102;
             special_pad_id = 0;
             special_mask_id = 103;
-
-            add_sep = true;
         } else if (tokenizer_model == "gpt2") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
@@ -1553,15 +1551,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "gigachat" ||
-                tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
-            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
-        } else if (
                 tokenizer_pre == "jina-v1-en" ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de" ||
                 tokenizer_pre == "jina-v2-code" ||
                 tokenizer_pre == "roberta-bpe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
-            add_sep = true;
         } else if (
                 tokenizer_pre == "refact") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1671,7 +1666,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         clean_spaces = true;
         add_bos = true;
         add_eos = false;
-        add_sep = true;
     } else if (type == LLAMA_VOCAB_TYPE_UGM) {
         pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         add_bos = false;
@@ -1808,7 +1802,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }
     }
 
-    // Handle add_bos, add_eos and add_sep
+    // Handle add_bos and add_eos
     {
         bool temp = true;
 
@@ -1818,9 +1812,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
             add_eos = temp;
         }
-        if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
-            add_sep = temp;
-        }
     }
 
     // auto-detect special tokens by text
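Aside on the ml.get_key(..., false) calls in this hunk: the final argument marks the key as optional, and the boolean return reports whether it was present, so a missing key leaves the caller's default untouched. A simplified sketch of that pattern (the map and get_key here are stand-ins, not llama_model_loader's real API, and the GGUF key strings are assumptions):

```cpp
#include <map>
#include <string>

// std::map stands in for the GGUF metadata attached to a model file.
static const std::map<std::string, bool> metadata = {
    { "tokenizer.ggml.add_bos_token", true },
};

// Simplified optional-key lookup: returns whether the key was found.
static bool get_key(const std::string & key, bool & out) {
    auto it = metadata.find(key);
    if (it == metadata.end()) {
        return false; // optional key absent: caller keeps its default
    }
    out = it->second;
    return true;
}

int main() {
    bool temp    = true;
    bool add_eos = false; // default survives: the key is missing above
    if (get_key("tokenizer.ggml.add_eos_token", temp)) {
        add_eos = temp;
    }
    return add_eos ? 1 : 0;
}
```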
@@ -1997,7 +1988,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eom_id|>"
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
-                    || t.first == "<|end_of_text|>"
                 ) {
                     special_eog_ids.insert(t.second);
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2070,9 +2060,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     //NOTE: Per token attributes are missing from the GGUF file.
     //TODO: Extract attributes from GGUF file.
     {
-        auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
             for (const auto & substr : substrs) {
-                if (str.find(substr) != std::string::npos) {
+                if (str.find(substr) < std::string::npos) {
                     return true;
                 }
             }
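Aside on this hunk's lambda: `str.find(substr) < std::string::npos` and `!= std::string::npos` accept exactly the same cases, since npos is the largest size_t value and is returned only on failure; the std::string_view parameter type merely avoids copying the substring list. A standalone sketch of the same matching behavior (an illustration, not the upstream code):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Same behavior as the hunk's lambda: find() returns npos only on
// failure, so comparing against npos detects any substring hit.
static bool contains_any(const std::string & str, const std::vector<std::string> & substrs) {
    for (const auto & substr : substrs) {
        if (str.find(substr) != std::string::npos) {
            return true;
        }
    }
    return false;
}

int main() {
    std::cout << contains_any("<|end_of_text|>", { "eos", "end" }) << "\n"; // 1
    std::cout << contains_any("<|fim_pad|>",     { "eos", "end" }) << "\n"; // 0
}
```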
@@ -3010,10 +3000,6 @@ bool llama_vocab::get_add_eos() const {
     return pimpl->add_eos;
 }
 
-bool llama_vocab::get_add_sep() const {
-    return pimpl->add_sep;
-}
-
 bool llama_vocab::get_ignore_merges() const {
     return pimpl->ignore_merges;
 }
@@ -3074,11 +3060,6 @@ int32_t llama_vocab::tokenize(
         bool add_special,
         bool parse_special) const {
     auto res = tokenize(std::string(text, text_len), add_special, parse_special);
-    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
-        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
-        return std::numeric_limits<int32_t>::min();
-    }
-
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
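For context on this hunk's size guard: tokenize reports a too-small buffer by returning the negated token count as int32_t, a convention that is only meaningful while res.size() fits in that type. A minimal standalone demonstration of what the narrowing does past that point (illustrative, not upstream code; the wrap shown assumes a typical two's-complement target):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    // One more token than int32_t can represent: 2^31 + 1.
    size_t n_tokens = (size_t) std::numeric_limits<int32_t>::max() + 2;
    // The cast narrows before the negation, so the "negative means
    // buffer too small" convention breaks down.
    int32_t reported = -((int32_t) n_tokens);
    printf("reported = %d\n", reported); // prints 2147483647: positive,
                                         // so a caller misreads the result
}
```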
@@ -3210,10 +3191,6 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
     return vocab->get_add_eos();
 }
 
-bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
-    return vocab->get_add_sep();
-}
-
 llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
     return vocab->token_fim_pre();
 }
