@@ -2421,7 +2421,7 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
24212421}
24222422
24232423std::vector<llama_token> llama_vocab::tokenize (
2424- std::string raw_text,
2424+ const std::string & raw_text,
24252425 bool add_special,
24262426 bool parse_special) const {
24272427 GGML_ASSERT (pimpl->tokenizer && " Tokenizer not initialized. Call llama_vocab::init_tokenizer() first." );
@@ -2452,19 +2452,21 @@ std::vector<llama_token> llama_vocab::tokenize(
24522452
24532453 for (const auto & fragment : fragment_buffer) {
24542454 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2455- auto raw_text = fragment. raw_text . substr (fragment. offset , fragment. length ) ;
2455+ std::string text ;
24562456
24572457 // prefix with space if previous is special
24582458 if (tokenizer_add_space_prefix && is_prev_special) {
2459- raw_text = " " + raw_text ;
2459+ text += ' ' ;
24602460 }
24612461
2462+ text += fragment.raw_text .substr (fragment.offset , fragment.length );
2463+
24622464#ifdef PRETOKENIZERDEBUG
2463- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2465+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
24642466#endif
2465- llama_escape_whitespace (raw_text );
2467+ llama_escape_whitespace (text );
24662468 llm_tokenizer_spm_session session (*this );
2467- session.tokenize (raw_text , output);
2469+ session.tokenize (text , output);
24682470 is_prev_special = false ;
24692471 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
24702472 output.push_back (fragment.token );
@@ -2494,12 +2496,12 @@ std::vector<llama_token> llama_vocab::tokenize(
24942496 }
24952497 for (const auto & fragment : fragment_buffer) {
24962498 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2497- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2499+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
24982500
24992501#ifdef PRETOKENIZERDEBUG
2500- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2502+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
25012503#endif
2502- session.tokenize (raw_text , output);
2504+ session.tokenize (text , output);
25032505 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
25042506 session.append (fragment.token , output);
25052507 }
@@ -2521,12 +2523,12 @@ std::vector<llama_token> llama_vocab::tokenize(
25212523
25222524 for (const auto & fragment : fragment_buffer) {
25232525 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2524- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2526+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
25252527
25262528#ifdef PRETOKENIZERDEBUG
2527- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2529+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
25282530#endif
2529- session.tokenize (raw_text , output);
2531+ session.tokenize (text , output);
25302532 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
25312533 output.push_back (fragment.token );
25322534 }
@@ -2547,11 +2549,11 @@ std::vector<llama_token> llama_vocab::tokenize(
25472549
25482550 for (const auto & fragment : fragment_buffer) {
25492551 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2550- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2552+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
25512553#ifdef PRETOKENIZERDEBUG
2552- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2554+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
25532555#endif
2554- session.tokenize (raw_text , output);
2556+ session.tokenize (text , output);
25552557 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
25562558 output.push_back (fragment.token );
25572559 }
@@ -2574,13 +2576,13 @@ std::vector<llama_token> llama_vocab::tokenize(
25742576 llm_tokenizer_rwkv_session session (*this , *static_cast <const llm_tokenizer_rwkv *>(pimpl->tokenizer .get ()));
25752577 for (const auto & fragment : fragment_buffer) {
25762578 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2577- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2579+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
25782580
25792581#ifdef PRETOKENIZERDEBUG
2580- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2582+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
25812583#endif
25822584
2583- session.tokenize (raw_text , output);
2585+ session.tokenize (text , output);
25842586 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
25852587 output.push_back (fragment.token );
25862588 }
0 commit comments