diff --git a/examples/demo_qwen.cpp b/examples/demo_qwen.cpp
index 1c70d52c..518019e1 100644
--- a/examples/demo_qwen.cpp
+++ b/examples/demo_qwen.cpp
@@ -18,10 +18,12 @@ int main(int argc, char **argv) {
     std::iostream::sync_with_stdio(false);
 
     cmdline::parser cmdParser;
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2.5-3b-instruct-q4_0_4x4.mllm");
     cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B | 3B |]", false, "3B");
+    cmdParser.add<string>("version", 'r', "[Qwen1.5 | Qwen2.5 |]", false, "Qwen2.5");
     cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
 
@@ -30,11 +32,12 @@ int main(int argc, char **argv) {
     string merge_path = cmdParser.get<string>("merge");
     string model_path = cmdParser.get<string>("model");
     string model_billion = cmdParser.get<string>("billion");
+    string model_version = cmdParser.get<string>("version");
     int tokens_limit = cmdParser.get<int>("limits");
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     auto tokenizer = QWenTokenizer(vocab_path, merge_path);
-    QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
+    QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE, model_version);
     auto model = QWenForCausalLM(config);
     model.load(model_path);
 
@@ -51,10 +54,10 @@ int main(int argc, char **argv) {
 
         LlmTextGeneratorOpts opt{
            .max_new_tokens = 100,
-            .do_sample = true,
-            .temperature = 0.3F,
-            .top_k = 50,
-            .top_p = 0.F,
+            .do_sample = false,
+            //.temperature = 0.7F,
+            //.top_k = 20,
+            //.top_p = 0.8F,
        };
        model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
            auto out_string = tokenizer.detokenize({out_token});
diff --git a/src/models/qwen/configuration_qwen.hpp b/src/models/qwen/configuration_qwen.hpp
index 44698438..b9cc719e 100644
--- a/src/models/qwen/configuration_qwen.hpp
+++ b/src/models/qwen/configuration_qwen.hpp
@@ -77,13 +77,17 @@ class QWenNameConfig : public TransformerNameConfig {
 };
 
 struct QWenConfig : public TransformerConfig {
-    explicit QWenConfig(int token_limit, string billions = "0.5B", RoPEType type = RoPEType::HFHUBROPE) :
+    explicit QWenConfig(int token_limit, string billions = "0.5B", RoPEType type = RoPEType::HFHUBROPE, string model_version = "Qwen1.5") :
         cache_limit(token_limit) {
         names_config.init(type);
         string billionsType;
         std::transform(billions.begin(), billions.end(), std::back_inserter(billionsType),
                        ::tolower);
-        if (billionsType == "0.5b") {
+        string modelVersion;
+        std::transform(model_version.begin(), model_version.end(), std::back_inserter(modelVersion),
+                       ::tolower);
+
+        if (billionsType == "0.5b" && modelVersion == "qwen1.5") {
             attention_dropout = 0.0;
             bos_token_id = 151643;
             eos_token_id = 151645;
@@ -102,6 +106,25 @@ struct QWenConfig : public TransformerConfig {
             sliding_window = 32768;
             vocab_size = 151936;
             tie_embedding_words = true;
+        } else if (billionsType == "0.5b" && modelVersion == "qwen2.5") {
+            attention_dropout = 0.0;
+            bos_token_id = 151643;
+            eos_token_id = 151645;
+            std::string hidden_act = "silu";
+            hidden_size = 896;
+            initializer_range = 0.02;
+            intermediate_size = 4864;
+            max_position_embeddings = 32768;
+            max_window_layers = 21;
+            model_type = "qwen2";
+            num_attention_heads = 14;
+            num_hidden_layers = 24;
+            num_key_value_heads = 2;
+            rms_norm_eps = 1e-6;
+            rope_theta = 1000000.0;
+            sliding_window = 32768;
+            vocab_size = 151936;
+            tie_embedding_words = true;
         } else if (billionsType == "1.8b") {
             attention_dropout = 0.0;
             std::string hidden_act = "silu";
@@ -116,7 +139,7 @@ struct QWenConfig : public TransformerConfig {
             sliding_window = 32768;
             vocab_size = 151936;
             tie_embedding_words = false;
-        } else if (billionsType == "1.5b") {
+        } else if (billionsType == "1.5b" && modelVersion == "qwen2.5") {
             attention_dropout = 0.0;
             std::string hidden_act = "silu";
             hidden_size = 1536;
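
Net effect of the patch: `QWenConfig` gains an optional fourth parameter, `model_version`, defaulting to `"Qwen1.5"` so existing call sites are unchanged; the demo exposes it through the new `-r`/`--version` flag; and the demo's generation options switch from top-k/top-p sampling to greedy decoding. Below is a minimal sketch of driving the new constructor directly, assuming the mllm demo headers; the model path is a hypothetical placeholder, not taken from this diff:

```cpp
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen.hpp"

using namespace mllm;

int main() {
    // QWenConfig lower-cases the version string internally, so "Qwen2.5"
    // and "qwen2.5" select the same hyperparameter branch.
    QWenConfig config(/*token_limit=*/400, "0.5B", RoPEType::HFHUBROPE, "Qwen2.5");

    auto model = QWenForCausalLM(config);
    model.load("../models/qwen-2.5-0.5b-instruct-q4_0_4x4.mllm"); // hypothetical path
    return 0;
}
```

One behavioral consequence worth flagging: the `"1.5b"` branch now also requires `modelVersion == "qwen2.5"`, so constructing a 1.5B config with the default `model_version` of `"Qwen1.5"` no longer matches any configuration branch.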