
Commit 524cdbe

Support InternLM 7B & 20B (#149)
1 parent f114c58 commit 524cdbe

File tree: 13 files changed, +726 −24 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -16,3 +16,6 @@ build/
 
# model
*.bin
+
+# clangd
+.cache/
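The new `.cache/` entry is where clangd keeps its index once a compile database is available in the repo. For context only (not part of this commit), generating that compile database with CMake's standard flag would look like:

```sh
# Not part of the diff: CMake's standard switch for emitting compile_commands.json,
# which clangd then indexes into .cache/clangd/ at the repo root.
cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
ln -sf build/compile_commands.json .   # optional: expose it where clangd looks by default
```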

README.md

Lines changed: 53 additions & 1 deletion
@@ -21,7 +21,7 @@ Highlights:
Support Matrix:
* Hardwares: x86/arm CPU, NVIDIA GPU, Apple Silicon GPU
* Platforms: Linux, MacOS, Windows
-* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan2](https://github.com/baichuan-inc/Baichuan2)
+* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan2](https://github.com/baichuan-inc/Baichuan2), [InternLM](https://github.com/InternLM/InternLM)

## Getting Started

@@ -154,6 +154,26 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o bai
```
</details>

+<details>
+<summary>InternLM-Chat-7B</summary>
+
+```sh
+python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b-v1_1 -t q4_0 -o internlm-chat-7b-ggml.bin
+./build/bin/main -m internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
+# 你好，我是书生·浦语，有什么可以帮助你的吗？
+```
+</details>
+
+<details>
+<summary>InternLM-Chat-20B</summary>
+
+```sh
+python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o internlm-chat-20b-ggml.bin
+./build/bin/main -m internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
+# 你好！有什么我可以帮到你的吗？
+```
+</details>
+
## Using BLAS

BLAS library can be integrated to further accelerate matrix multiplication. However, in some cases, using BLAS may cause performance degradation. Whether to turn on BLAS should depend on the benchmarking result.
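As an editor's aside (not part of the diff): a minimal sketch of how a BLAS or GPU backend is usually switched on for a ggml-based CMake build. The exact option names below are assumptions, so confirm them against the project's full README before copying.

```sh
# Assumed ggml-style CMake options; verify against the project's README before use.
cmake -B build -DGGML_OPENBLAS=ON && cmake --build build -j   # OpenBLAS on CPU
cmake -B build -DGGML_CUBLAS=ON && cmake --build build -j     # cuBLAS on NVIDIA GPU
# Benchmark both builds and keep BLAS only if ms/token actually improves.
```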
@@ -293,6 +313,24 @@ python3 web_demo.py -m ../baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --t
```
</details>

+<details>
+<summary>InternLM-Chat-7B</summary>
+
+```sh
+python3 cli_chat.py -m ../internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8  # CLI demo
+python3 web_demo.py -m ../internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8  # web demo
+```
+</details>
+
+<details>
+<summary>InternLM-Chat-20B</summary>
+
+```sh
+python3 cli_chat.py -m ../internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8  # CLI demo
+python3 web_demo.py -m ../internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8  # web demo
+```
+</details>
+
**Load and optimize Hugging Face LLMs in one line of code**

Sometimes it might be inconvenient to convert and save the intermediate GGML models beforehand. Here is an option to directly load from the original Hugging Face model, quantize it into GGML models in a minute, and start serving. All you need is to replace the GGML model path with the Hugging Face model name or path.
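As an editor's aside (not part of the diff): applied to the newly added models, that swap is just the earlier CLI command with the converted `.bin` path replaced by the Hugging Face repo id, for example:

```sh
# Sketch: load InternLM-Chat-7B directly from Hugging Face instead of a converted GGML file.
python3 cli_chat.py -m internlm/internlm-chat-7b-v1_1 -p 你好 --top_p 0.8 --temp 0.8
```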
@@ -465,6 +503,20 @@ Baichuan-13B / Baichuan2-13B:
| file size | 7.0G | 7.8G | 8.5G | 9.3G | 14G | 25G |
| mem usage | 7.8G | 8.8G | 9.5G | 10G | 14G | 25G |

+InternLM-7B:
+
+|                                | Q4_0 | Q4_1 | Q5_0  | Q5_1  | Q8_0  | F16   |
+|--------------------------------|------|------|-------|-------|-------|-------|
+| ms/token (CPU @ Platinum 8260) | 85.3 | 90.1 | 103.5 | 112.5 | 137.3 | 232.2 |
+| ms/token (CUDA @ V100 SXM2)    | 9.1  | 9.4  | 10.5  | 10.5  | 13.3  | 21.1  |
+
+InternLM-20B:
+
+|                                | Q4_0  | Q4_1  | Q5_0  | Q5_1  | Q8_0  | F16 |
+|--------------------------------|-------|-------|-------|-------|-------|-----|
+| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A |
+| ms/token (CUDA @ V100 SXM2)    | 21.6  | 23.2  | 25.0  | 25.9  | 33.4  | N/A |
+
## Development

**Unit Test & Benchmark**

chatglm.cpp

Lines changed: 190 additions & 0 deletions
@@ -426,6 +426,8 @@ std::string to_string(ModelType model_type) {
        return "Baichuan7B";
    case MODEL_TYPE_BAICHUAN13B:
        return "Baichuan13B";
+    case MODEL_TYPE_INTERNLM:
+        return "InternLM";
    default:
        CHATGLM_THROW << "unknown model type " << model_type;
    }
@@ -1165,6 +1167,174 @@ void Baichuan13BForCausalLM::load(ModelLoader &loader) {
11651167
ctx_.init_device_context();
11661168
}
11671169

1170+
// ===== InternLM =====
1171+
1172+
InternLMTokenizer::InternLMTokenizer(std::string_view serialized_model_proto) {
1173+
const auto status = sp.LoadFromSerializedProto(serialized_model_proto);
1174+
CHATGLM_CHECK(status.ok()) << status.ToString();
1175+
}
1176+
1177+
std::vector<int> InternLMTokenizer::encode(const std::string &text, int max_length) const {
1178+
std::vector<int> ids;
1179+
sp.Encode(text, &ids);
1180+
ids.insert(ids.begin(), {bos_token_id}); // special prefix
1181+
if ((int)ids.size() > max_length) {
1182+
// sliding window: drop the least recent history while keeping the special prefix
1183+
int num_drop = (int)ids.size() - max_length;
1184+
ids.erase(ids.begin() + 1, ids.begin() + 1 + num_drop);
1185+
}
1186+
return ids;
1187+
}
1188+
1189+
std::string InternLMTokenizer::decode(const std::vector<int> &ids) const {
1190+
// filter out special tokens
1191+
std::vector<int> normal_ids(ids);
1192+
normal_ids.erase(std::remove_if(normal_ids.begin(), normal_ids.end(), [this](int id) { return is_special_id(id); }),
1193+
normal_ids.end());
1194+
1195+
std::string text;
1196+
sp.Decode(normal_ids, &text);
1197+
// remove <eoa> and its following
1198+
size_t eoa_pos = text.find("<eoa>");
1199+
if (eoa_pos != std::string::npos) {
1200+
text.erase(eoa_pos);
1201+
}
1202+
return text;
1203+
}
1204+
1205+
std::vector<int> InternLMTokenizer::encode_history(const std::vector<std::string> &history, int max_length) const {
1206+
std::string prompt = build_prompt(history);
1207+
std::vector<int> input_ids = encode(prompt, max_length);
1208+
return input_ids;
1209+
}
1210+
1211+
std::string InternLMTokenizer::build_prompt(const std::vector<std::string> &history) {
1212+
CHATGLM_CHECK(history.size() % 2 == 1) << "invalid history size " << history.size();
1213+
1214+
std::ostringstream oss_prompt;
1215+
for (size_t i = 0; i < history.size(); i += 2) {
1216+
oss_prompt << "<|User|>:" << history[i] << "<eoh>\n<|Bot|>:";
1217+
if (i < history.size() - 1) {
1218+
oss_prompt << history[i + 1] << "<eoa>\n";
1219+
}
1220+
}
1221+
return oss_prompt.str();
1222+
}
1223+
1224+
InternLM7BForCausalLM::InternLM7BForCausalLM(const ModelConfig &config)
1225+
: BasicModelForCausalLM(MODEL_TYPE_INTERNLM, config, MEM_SIZE, SCRATCH_SIZE) {
1226+
constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
1227+
const size_t num_weights = 3 + config.num_hidden_layers * 9;
1228+
const size_t ctx_w_size = num_weights * tensor_ovhd;
1229+
const size_t ctx_kv_size = 2 * config.num_hidden_layers *
1230+
(config.max_length * config.hidden_size * ggml_type_size(GGML_TYPE_F16) + tensor_ovhd);
1231+
ctx_.dtype = config.dtype;
1232+
ctx_.ctx_w = make_unique_ggml_context(ctx_w_size, nullptr, true);
1233+
ctx_.ctx_kv = make_unique_ggml_context(ctx_kv_size + 1 * MB, nullptr, false); // 1MB extra for MPS
1234+
1235+
transformer = InternLM7BModel(&ctx_, config);
1236+
lm_head = Linear(&ctx_, config.hidden_size, config.vocab_size, false);
1237+
CHATGLM_CHECK(ggml_used_mem(ctx_.ctx_w.get()) == ggml_get_mem_size(ctx_.ctx_w.get())) << "corrupted model weights";
1238+
CHATGLM_CHECK(ggml_used_mem(ctx_.ctx_kv.get()) == ctx_kv_size) << "corrupted kv cache";
1239+
1240+
// build state_dict
1241+
state_dict_.reserve(num_weights);
1242+
state_dict_.emplace_back("model.embed_tokens.weight", transformer.word_embeddings.weight);
1243+
for (int i = 0; i < config.num_hidden_layers; i++) {
1244+
std::string layer_prefix = "model.layers." + std::to_string(i) + '.';
1245+
state_dict_.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight);
1246+
state_dict_.emplace_back(layer_prefix + "self_attn.qkv_proj.weight",
1247+
transformer.layers[i].attention.query_key_value.weight);
1248+
if (transformer.layers[i].attention.query_key_value.bias) {
1249+
state_dict_.emplace_back(layer_prefix + "self_attn.qkv_proj.bias",
1250+
transformer.layers[i].attention.query_key_value.bias);
1251+
}
1252+
state_dict_.emplace_back(layer_prefix + "self_attn.o_proj.weight",
1253+
transformer.layers[i].attention.dense.weight);
1254+
if (transformer.layers[i].attention.dense.bias) {
1255+
state_dict_.emplace_back(layer_prefix + "self_attn.o_proj.bias",
1256+
transformer.layers[i].attention.dense.bias);
1257+
}
1258+
state_dict_.emplace_back(layer_prefix + "post_attention_layernorm.weight",
1259+
transformer.layers[i].post_attention_layernorm.weight);
1260+
state_dict_.emplace_back(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight);
1261+
state_dict_.emplace_back(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight);
1262+
state_dict_.emplace_back(layer_prefix + "mlp.down_proj.weight", transformer.layers[i].mlp.down_proj.weight);
1263+
}
1264+
state_dict_.emplace_back("model.norm.weight", transformer.final_layernorm.weight);
1265+
state_dict_.emplace_back("lm_head.weight", lm_head.weight);
1266+
}
1267+
1268+
void InternLM7BForCausalLM::load(ModelLoader &loader) {
1269+
for (auto &item : state_dict_) {
1270+
const std::string &name = item.first;
1271+
ggml_tensor *tensor = item.second;
1272+
loader.read_tensor(name, tensor);
1273+
}
1274+
1275+
to_device("model.embed_tokens.weight");
1276+
1277+
ctx_.weight_buffer = std::string_view(loader.data, loader.size);
1278+
ctx_.init_device_context();
1279+
}
1280+
1281+
InternLM20BForCausalLM::InternLM20BForCausalLM(const ModelConfig &config)
1282+
: BasicModelForCausalLM(MODEL_TYPE_INTERNLM, config, MEM_SIZE, SCRATCH_SIZE) {
1283+
constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
1284+
const size_t num_weights = 3 + config.num_hidden_layers * 7;
1285+
const size_t ctx_w_size = num_weights * tensor_ovhd;
1286+
const size_t ctx_kv_size = 2 * config.num_hidden_layers *
1287+
(config.max_length * config.hidden_size * ggml_type_size(GGML_TYPE_F16) + tensor_ovhd);
1288+
ctx_.dtype = config.dtype;
1289+
ctx_.ctx_w = make_unique_ggml_context(ctx_w_size, nullptr, true);
1290+
ctx_.ctx_kv = make_unique_ggml_context(ctx_kv_size + 1 * MB, nullptr, false); // 1MB extra for MPS
1291+
1292+
transformer = InternLM20BModel(&ctx_, config);
1293+
lm_head = Linear(&ctx_, config.hidden_size, config.vocab_size, false);
1294+
CHATGLM_CHECK(ggml_used_mem(ctx_.ctx_w.get()) == ggml_get_mem_size(ctx_.ctx_w.get())) << "corrupted model weights";
1295+
CHATGLM_CHECK(ggml_used_mem(ctx_.ctx_kv.get()) == ctx_kv_size) << "corrupted kv cache";
1296+
1297+
// build state_dict
1298+
state_dict_.reserve(num_weights);
1299+
state_dict_.emplace_back("model.embed_tokens.weight", transformer.word_embeddings.weight);
1300+
for (int i = 0; i < config.num_hidden_layers; i++) {
1301+
std::string layer_prefix = "model.layers." + std::to_string(i) + '.';
1302+
state_dict_.emplace_back(layer_prefix + "input_layernorm.weight", transformer.layers[i].input_layernorm.weight);
1303+
state_dict_.emplace_back(layer_prefix + "self_attn.qkv_proj.weight",
1304+
transformer.layers[i].attention.query_key_value.weight);
1305+
if (transformer.layers[i].attention.query_key_value.bias) {
1306+
state_dict_.emplace_back(layer_prefix + "self_attn.qkv_proj.bias",
1307+
transformer.layers[i].attention.query_key_value.bias);
1308+
}
1309+
state_dict_.emplace_back(layer_prefix + "self_attn.o_proj.weight",
1310+
transformer.layers[i].attention.dense.weight);
1311+
if (transformer.layers[i].attention.dense.bias) {
1312+
state_dict_.emplace_back(layer_prefix + "self_attn.o_proj.bias",
1313+
transformer.layers[i].attention.dense.bias);
1314+
}
1315+
state_dict_.emplace_back(layer_prefix + "post_attention_layernorm.weight",
1316+
transformer.layers[i].post_attention_layernorm.weight);
1317+
state_dict_.emplace_back(layer_prefix + "mlp.gate_proj.weight", transformer.layers[i].mlp.gate_proj.weight);
1318+
state_dict_.emplace_back(layer_prefix + "mlp.up_proj.weight", transformer.layers[i].mlp.up_proj.weight);
1319+
state_dict_.emplace_back(layer_prefix + "mlp.down_proj.weight", transformer.layers[i].mlp.down_proj.weight);
1320+
}
1321+
state_dict_.emplace_back("model.norm.weight", transformer.final_layernorm.weight);
1322+
state_dict_.emplace_back("lm_head.weight", lm_head.weight);
1323+
}
1324+
1325+
void InternLM20BForCausalLM::load(ModelLoader &loader) {
1326+
for (auto &item : state_dict_) {
1327+
const std::string &name = item.first;
1328+
ggml_tensor *tensor = item.second;
1329+
loader.read_tensor(name, tensor);
1330+
}
1331+
1332+
to_device("model.embed_tokens.weight");
1333+
1334+
ctx_.weight_buffer = std::string_view(loader.data, loader.size);
1335+
ctx_.init_device_context();
1336+
}
1337+
11681338
// ===== pipeline =====
11691339

11701340
Pipeline::Pipeline(const std::string &path) {
@@ -1241,6 +1411,26 @@ Pipeline::Pipeline(const std::string &path) {
        // load model
        model = std::make_unique<Baichuan13BForCausalLM>(config);
        model->load(loader);
+    } else if (model_type == MODEL_TYPE_INTERNLM) {
+        CHATGLM_CHECK(version == 1) << "only support version 1 for now but got " << version;
+
+        // load config
+        ModelConfig config(loader.read_basic<ConfigRecordV1>());
+        config.norm_eps = 1e-6;
+
+        // load tokenizer
+        int proto_size = loader.read_basic<int>();
+        std::string_view serialized_model_proto((char *)mapped_file->data + loader.tell(), proto_size);
+        loader.seek(proto_size, SEEK_CUR);
+        tokenizer = std::make_unique<InternLMTokenizer>(serialized_model_proto);
+
+        // load model
+        if (config.hidden_size == 4096) {
+            model = std::make_unique<InternLM7BForCausalLM>(config);
+        } else {
+            model = std::make_unique<InternLM20BForCausalLM>(config);
+        }
+        model->load(loader);
    } else {
        CHATGLM_THROW << "invalid model type " << model_type;
    }

chatglm.h

Lines changed: 64 additions & 0 deletions
@@ -739,6 +739,7 @@ enum ModelType {
    MODEL_TYPE_CHATGLM2 = 2,
    MODEL_TYPE_BAICHUAN7B = 1024,
    MODEL_TYPE_BAICHUAN13B = 1025,
+    MODEL_TYPE_INTERNLM = 1280,
};

int get_num_physical_cores();
@@ -1031,6 +1032,69 @@ class Baichuan13BForCausalLM : public BasicModelForCausalLM<Baichuan13BModel> {
    static constexpr size_t SCRATCH_SIZE = 1280 * MB;
};

+// ===== InternLM =====
+
+class InternLMTokenizer : public BaseTokenizer {
+  public:
+    InternLMTokenizer(std::string_view serialized_model_proto);
+
+    std::vector<int> encode(const std::string &text, int max_length) const override;
+
+    std::string decode(const std::vector<int> &ids) const override;
+
+    std::vector<int> encode_history(const std::vector<std::string> &history, int max_length) const override;
+
+    static std::string build_prompt(const std::vector<std::string> &history);
+
+    bool is_special_id(int id) const { return id == unk_token_id || id == bos_token_id || id == eos_token_id; }
+
+  public:
+    sentencepiece::SentencePieceProcessor sp;
+    static constexpr int unk_token_id = 0;
+    static constexpr int bos_token_id = 1;
+    static constexpr int eos_token_id = 2;
+};
+
+using InternLM7BAttention =
+    BasicAttention<true, true, false, BasicRoper<ROPE_TYPE_NEOX, 1>, false, CausalContextMasker>;
+
+using InternLM7BMLP = BasicGLU<ACT_TYPE_SILU, false>;
+
+using InternLM7BBlock = BasicBlock<RMSNorm, InternLM7BAttention, InternLM7BMLP>;
+
+using InternLM7BModel = BasicModel<InternLM7BBlock, RMSNorm, BasicPositionIdsGenerator>;
+
+class InternLM7BForCausalLM : public BasicModelForCausalLM<InternLM7BModel> {
+  public:
+    InternLM7BForCausalLM(const ModelConfig &config);
+
+    void load(ModelLoader &loader) override;
+
+  public:
+    static constexpr size_t MEM_SIZE = 512 * MB;
+    static constexpr size_t SCRATCH_SIZE = 1024 * MB;
+};
+
+using InternLM20BAttention =
+    BasicAttention<false, false, false, BasicRoper<ROPE_TYPE_NEOX, 1>, false, CausalContextMasker>;
+
+using InternLM20BMLP = BasicGLU<ACT_TYPE_SILU, false>;
+
+using InternLM20BBlock = BasicBlock<RMSNorm, InternLM20BAttention, InternLM20BMLP>;
+
+using InternLM20BModel = BasicModel<InternLM20BBlock, RMSNorm, BasicPositionIdsGenerator>;
+
+class InternLM20BForCausalLM : public BasicModelForCausalLM<InternLM20BModel> {
+  public:
+    InternLM20BForCausalLM(const ModelConfig &config);
+
+    void load(ModelLoader &loader) override;
+
+  public:
+    static constexpr size_t MEM_SIZE = 512 * MB;
+    static constexpr size_t SCRATCH_SIZE = 1024 * MB;
+};
+
// ===== pipeline =====

class Pipeline {

chatglm_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@

import chatglm_cpp._C as _C

-__version__ = "0.2.8"
+__version__ = "0.2.9"


class Pipeline(_C.Pipeline):
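The version bump implies InternLM support lands in the 0.2.9 Python wheel. Assuming the package keeps its usual PyPI name, picking up the release would look like:

```sh
# Assumption: the 0.2.9 wheel is published to PyPI under the existing package name.
pip install -U chatglm-cpp
```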
