@@ -1322,11 +1322,13 @@ struct llama_vocab::impl {
13221322                         char  * text,
13231323                      int32_t    text_len_max,
13241324                         bool    remove_special,
1325-                          bool    unparse_special) const ;
1325+                          bool    unparse_special,
1326+                          bool    remove_space_prefix = true ) const ;
13261327
13271328    std::string detokenize (
13281329            const  std::vector<llama_token> & tokens,
1329-                                       bool    special) const ;
1330+                                       bool    special,
1331+                                       bool    remove_space_prefix = true ) const ;
13301332
13311333    void  print_info () const ;
13321334
@@ -2581,7 +2583,8 @@ int32_t llama_vocab::impl::detokenize(
25812583                            char  * text,
25822584                         int32_t    text_len_max,
25832585                            bool    remove_special,
2584-                             bool    unparse_special) const  {
2586+                             bool    unparse_special,
2587+                             bool    remove_space_prefix) const  {
25852588    if  (type == LLAMA_VOCAB_TYPE_NONE) {
25862589        return  0 ;
25872590    }
@@ -2592,7 +2595,7 @@ int32_t llama_vocab::impl::detokenize(
25922595    int32_t  total = 0 ;
25932596
25942597    //  remove the leading space
2595-     bool  remove_space = add_space_prefix;
2598+     bool  remove_space = add_space_prefix && remove_space_prefix ;
25962599
25972600    if  (remove_special && add_bos) {
25982601        if  (n_tokens > 0  && tokens[0 ] == special_bos_id) {
@@ -2991,17 +2994,18 @@ int32_t llama_vocab::detokenize(
29912994                            char  * text,
29922995                         int32_t    text_len_max,
29932996                            bool    remove_special,
2994-                             bool    unparse_special) const  {
2995-     return  pimpl->detokenize (tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
2997+                             bool    unparse_special,
2998+                             bool    remove_space_prefix) const  {
2999+     return  pimpl->detokenize (tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
29963000}
29973001
2998- std::string llama_vocab::detokenize (const  std::vector<llama_token> & tokens, bool  special) const  {
3002+ std::string llama_vocab::detokenize (const  std::vector<llama_token> & tokens, bool  special,  bool  remove_space_prefix ) const  {
29993003    std::string text;
30003004    text.resize (std::max (text.capacity (), tokens.size ()));
3001-     int32_t  n_chars = detokenize (tokens.data (), (int32_t )tokens.size (), &text[0 ], (int32_t )text.size (), false , special);
3005+     int32_t  n_chars = detokenize (tokens.data (), (int32_t )tokens.size (), &text[0 ], (int32_t )text.size (), false , special, remove_space_prefix );
30023006    if  (n_chars < 0 ) {
30033007        text.resize (-n_chars);
3004-         n_chars = detokenize (tokens.data (), (int32_t )tokens.size (), &text[0 ], (int32_t )text.size (), false , special);
3008+         n_chars = detokenize (tokens.data (), (int32_t )tokens.size (), &text[0 ], (int32_t )text.size (), false , special, remove_space_prefix );
30053009        GGML_ASSERT (n_chars <= (int32_t )text.size ());  //  whitespace trimming is performed after per-token detokenization
30063010    }
30073011
@@ -3246,7 +3250,8 @@ int32_t llama_detokenize(
32463250                        char  * text,
32473251                     int32_t    text_len_max,
32483252                        bool    remove_special,
3249-                         bool    unparse_special) {
3250-     return  vocab->detokenize (tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
3253+                         bool    unparse_special,
3254+                         bool    remove_space_prefix) {
3255+     return  vocab->detokenize (tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
32513256}
32523257
0 commit comments