@@ -1301,15 +1301,23 @@ def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
-            self._set_vocab_llama_hf()
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types=['prefix', 'suffix', 'middle', 'eot'])
-        special_vocab._set_special_token("prefix", 32007)
-        special_vocab._set_special_token("suffix", 32008)
-        special_vocab._set_special_token("middle", 32009)
-        special_vocab._set_special_token("eot", 32010)
-        special_vocab.add_to_gguf(self.gguf_writer)
+            try:
+                self._set_vocab_llama_hf()
+            except (FileNotFoundError, TypeError):
+                # Llama 3
+                self._set_vocab_gpt2()
+
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=False,
+                special_token_types=['prefix', 'suffix', 'middle', 'eot']
+            )
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot", 32010)
+            special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
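
For context, the prefix/suffix/middle/eot IDs exported in the hunk above follow the CodeLlama infill convention. Below is a minimal sketch of how a client might assemble a fill-in-the-middle prompt from them; the helper name and `tokenize` callable are illustrative and not part of this patch:

# Illustrative only: CodeLlama-style infill prompt built from the special
# token IDs exported above. `tokenize` is assumed to map text to token IDs
# without adding BOS/EOS.
PREFIX_ID, SUFFIX_ID, MIDDLE_ID, EOT_ID = 32007, 32008, 32009, 32010

def build_infill_prompt(tokenize, code_before: str, code_after: str) -> list[int]:
    # <PRE> {code_before} <SUF> {code_after} <MID> -> model generates the middle and stops at <EOT>
    return [PREFIX_ID] + tokenize(code_before) + [SUFFIX_ID] + tokenize(code_after) + [MIDDLE_ID]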
@@ -2194,6 +2202,8 @@ def set_vocab(self):
         old_eos = special_vocab.special_token_ids["eos"]
         if "chat" in os.path.basename(self.dir_model.absolute()):
             # For the chat model, we replace the eos with '<|im_end|>'.
+            # TODO: this is a hack, should be fixed
+            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
             print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
 in chat mode so that the conversation can end normally.")
@@ -2429,12 +2439,15 @@ class GemmaModel(Model):
 
     def set_vocab(self):
         self._set_vocab_sentencepiece()
+
+        # TODO: these special tokens should be exported only for the CodeGemma family
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types=['prefix', 'suffix', 'middle', 'eot'])
+                                          special_token_types=['prefix', 'suffix', 'middle', 'fsep', 'eot'])
         special_vocab._set_special_token("prefix", 67)
         special_vocab._set_special_token("suffix", 69)
         special_vocab._set_special_token("middle", 68)
-        special_vocab._set_special_token("eot", 70)
+        special_vocab._set_special_token("fsep", 70)
+        special_vocab._set_special_token("eot", 107)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
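
For reference, a rough mapping of the Gemma token IDs used above to tokenizer pieces. The token strings are an assumption about the CodeGemma vocabulary, not something asserted by this patch, and should be checked against the model's tokenizer:

# Assumed CodeGemma tokenizer pieces for the IDs above (verify against tokenizer.model):
CODEGEMMA_FIM_TOKENS = {
    "prefix": (67,  "<|fim_prefix|>"),
    "suffix": (69,  "<|fim_suffix|>"),
    "middle": (68,  "<|fim_middle|>"),
    "fsep":   (70,  "<|file_separator|>"),  # separates files in multi-file infill prompts
    "eot":    (107, "<end_of_turn>"),       # reused here as the end-of-infill marker
}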
@@ -2523,28 +2536,34 @@ def set_vocab(self):
 
         field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
         self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
         self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
         self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
         self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
         self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
         self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
         self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
 
     def set_gguf_parameters(self):
-        d_model = self.find_hparam(["hidden_size", "d_model"])
-        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_model = self.find_hparam(["hidden_size",       "d_model"])
+        d_conv  = self.find_hparam(["conv_kernel",       "d_conv"],  optional=True) or 4
         d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
-        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+        d_state = self.find_hparam(["state_size",        "d_state"], optional=True) or 16
         # ceiling division
         # ref: https://stackoverflow.com/a/17511341/22827863
         # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
-        dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+        dt_rank      = self.find_hparam(["time_step_rank",     "dt_rank"],      optional=True) or -(d_model // -16)
         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
 
         # Fail early for models which don't have a block expansion factor of 2
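
The `-(d_model // -16)` expression in the dt_rank default is ceiling division written with floor division, which avoids a `math.ceil` import and stays exact on integers. A quick sanity check of the identity (the example values are arbitrary):

# -(a // -b) == ceil(a / b) for positive integers a, b.
assert -(2560 // -16) == 160   # exact multiple: same as 2560 // 16
assert -(2568 // -16) == 161   # rounds up, whereas 2568 // 16 == 160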