Skip to content

Commit 8090e75

Browse files
committed
llama : vocab : private charsmap
ggml-ci
1 parent 6abe822 commit 8090e75

File tree

2 files changed

+13
-13
lines changed

2 files changed

+13
-13
lines changed

src/llama-vocab.cpp

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -707,28 +707,28 @@ struct llm_tokenizer_wpm_session {
707707
//
708708

709709
struct llm_tokenizer_ugm : llm_tokenizer {
710-
llm_tokenizer_ugm(const llama_vocab & vocab) {
711-
if (vocab.precompiled_charsmap.size() > 0) {
710+
llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
711+
if (precompiled_charsmap.size() > 0) {
712712
size_t charsmap_offset = 0;
713713

714714
// First four bytes of precompiled_charsmap contains length of binary
715715
// blob containing XOR-compressed compact double array (XCDA) entries
716-
uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
716+
uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
717717
charsmap_offset += sizeof(xcda_blob_size);
718-
if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
718+
if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
719719
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
720720
}
721721

722722
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
723723
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
724-
xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
724+
xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
725725
xcda_array_size = xcda_blob_size / sizeof(uint32_t);
726726
charsmap_offset += xcda_blob_size;
727727

728728
// Remaining bytes of precompiled charsmap contain null-terminated
729729
// replacement strings for prefixes matched by the XCDA.
730-
prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
731-
prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
730+
prefix_replacements = &precompiled_charsmap[charsmap_offset];
731+
prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
732732
}
733733

734734
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
@@ -1169,6 +1169,8 @@ struct llm_tokenizer_rwkv_session {
11691169
struct llama_vocab::impl {
11701170
std::unique_ptr<llm_tokenizer> tokenizer;
11711171

1172+
std::vector<char> precompiled_charsmap;
1173+
11721174
impl(const llama_vocab & vocab) : vocab(vocab) {
11731175
}
11741176

@@ -1195,7 +1197,7 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
11951197
tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
11961198
break;
11971199
case LLAMA_VOCAB_TYPE_UGM:
1198-
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab);
1200+
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
11991201
break;
12001202
case LLAMA_VOCAB_TYPE_RWKV:
12011203
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
@@ -1334,14 +1336,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
13341336
if (precompiled_charsmap_keyidx != -1) {
13351337
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
13361338
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1337-
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1339+
pimpl->precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
13381340
#ifdef IS_BIG_ENDIAN
13391341
// correct endianness of data in precompiled_charsmap binary blob
1340-
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1342+
uint32_t * xcda_blob_size = (uint32_t *) &pimpl->precompiled_charsmap[0];
13411343
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
13421344
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
13431345
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1344-
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1346+
uint32_t * xcda_array = (uint32_t *) &pimpl->precompiled_charsmap[sizeof(uint32_t)];
13451347
for (size_t i = 0; i < xcda_array_size; ++i) {
13461348
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
13471349
}

src/llama-vocab.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ struct llama_vocab {
7373
bool tokenizer_escape_whitespaces = true;
7474
bool tokenizer_treat_whitespace_as_suffix = false;
7575

76-
std::vector<char> precompiled_charsmap;
77-
7876
llama_vocab();
7977
~llama_vocab();
8078

0 commit comments

Comments
 (0)