
Commit d087f74

vocab : INT32_MIN from llama_tokenize on overflow
1 parent: f5972a1

3 files changed: +5 -1 lines changed

common/common.cpp

Lines changed: 3 additions & 0 deletions
@@ -1284,6 +1284,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
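
The ordering of the new check matters: it has to come before the existing n_tokens < 0 branch, because that branch negates the return value (result.resize(-n_tokens)), and negating INT32_MIN is signed-integer overflow, which is undefined behavior in C++. A minimal caller-side sketch of the new behavior, assuming the common_tokenize overload shown above; the helper name tokenize_or_empty is hypothetical and not part of the commit:

// Sketch only: returns an empty vector instead of propagating the
// overflow exception thrown by common_tokenize.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>
#include "common.h" // provides common_tokenize, llama_token, llama_vocab

std::vector<llama_token> tokenize_or_empty(const llama_vocab * vocab, const std::string & text) {
    try {
        return common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/false);
    } catch (const std::runtime_error & e) {
        // reached when llama_tokenize returned INT32_MIN (overflow)
        fprintf(stderr, "tokenization failed: %s\n", e.what());
        return {};
    }
}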

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -1087,6 +1087,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
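
With this line, the documented contract has three cases: success (token count), buffer too small (negated required count), and overflow (INT32_MIN). A sketch of a caller honoring all three against the raw API; it mirrors, but is not, the library's own common_tokenize helper, and assumes llama.h plus an initialized vocab:

// Two-pass tokenization following the documented return values.
#include <cstdint> // INT32_MIN
#include <string>
#include <vector>
#include "llama.h"

bool tokenize_text(const llama_vocab * vocab, const std::string & text, std::vector<llama_token> & out) {
    out.resize(text.size() + 2); // rough upper bound, incl. BOS/EOS
    int32_t n = llama_tokenize(vocab, text.data(), (int32_t) text.size(),
                               out.data(), (int32_t) out.size(),
                               /*add_special=*/true, /*parse_special=*/false);
    if (n == INT32_MIN) {
        return false; // overflow: result size does not fit in int32_t
    }
    if (n < 0) {
        out.resize(-n); // buffer too small: -n is the required count
        n = llama_tokenize(vocab, text.data(), (int32_t) text.size(),
                           out.data(), (int32_t) out.size(),
                           /*add_special=*/true, /*parse_special=*/false);
        if (n < 0) {
            return false;
        }
    }
    out.resize(n); // success: n tokens written
    return true;
}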

src/llama-vocab.cpp

Lines changed: 1 addition & 1 deletion
@@ -3062,7 +3062,7 @@ int32_t llama_vocab::tokenize(
     auto res = tokenize(std::string(text, text_len), add_special, parse_special);
     if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
         LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
-        return -1;
+        return std::numeric_limits<int32_t>::min();
     }
 
     if (n_tokens_max < (int) res.size()) {
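
The old return value of -1 was ambiguous: under the contract above, a negative return means "buffer too small, the magnitude is the required token count", so -1 read as "retry with a 1-token buffer" rather than as a hard failure. INT32_MIN cannot collide with any legitimate shortfall, because the guard rejects any result size at or above INT32_MAX before it is ever negated. An illustrative check of that invariant (not from the commit; encode_shortfall is a hypothetical name):

// Any legitimate "buffer too small" return is -(res.size()) with
// res.size() < INT32_MAX, so it is always strictly greater than
// INT32_MIN and the sentinel is unambiguous.
#include <cassert>
#include <cstddef>
#include <cstdint>

int32_t encode_shortfall(size_t required) {
    assert(required < (size_t) INT32_MAX); // enforced by the guard above
    return -(int32_t) required;            // always > INT32_MIN
}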
