Commit ead7370

Author: Judd
support phi-3.5 mini & moe

1 parent c45c7e1 commit ead7370
7 files changed: +293, -38 lines

README.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2024-07-24: Phi-3.5 Mini & MoE
 * 2024-07-24: Llama 3.1
 * 2024-07-23: Llama-3-Groq with tool calling
 * 2024-07-17: Mistral Nemo

convert.py

Lines changed: 92 additions & 1 deletion
@@ -81,6 +81,8 @@ class ModelType(Enum):
     Phi3 = 0x520
     Phi3_ScalingSU = 0x521
     Phi3_ScalingSU2 = 0x522
+    Phi3_ScalingSU3 = 0x523
+    Phi3MoE_ScalingSU = 0x530
 
     Mistral = 0x600
     Mixtral = 0x601
@@ -2345,6 +2347,90 @@ def dump_config(f, config, ggml_type):
     def get_weight_names(config):
         return Phi3Converter.get_weight_names(config)
 
+class Phi3SU3Converter(BaseConverter):
+    MODEL_TYPE = ModelType.Phi3_ScalingSU3
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        return Phi3SUConverter.state_dict_pp(config, state_dict)
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        Phi3SUConverter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            config.rope_scaling['short_mscale'],
+            config.rope_scaling['long_mscale'],
+        ]
+        f.write(struct.pack('<' + "f" * len(config_values), *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        return Phi3SUConverter.get_weight_names(config)
+
+class Phi3MoESUConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Phi3MoE_ScalingSU
+
+    @classmethod
+    def pp(cls, config, name: str, tensor):
+        if name.endswith('k_proj.weight'):
+            return permute(tensor, config.num_key_value_heads)
+        elif name.endswith('q_proj.weight'):
+            return permute(tensor, config.num_attention_heads)
+        else:
+            return tensor
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.lm_head_bias, 'lm_head_bias must be True'
+
+        Phi3SU3Converter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_experts_per_tok,
+            config.num_local_experts,
+        ]
+        f.write(struct.pack('<' + "i" * len(config_values), *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.input_layernorm.bias",
+            ]
+
+            for j in range(config.num_local_experts):
+                weight_names += [
+                    f"model.layers.{i}.block_sparse_moe.experts.{j}.w1.weight",
+                    f"model.layers.{i}.block_sparse_moe.experts.{j}.w2.weight",
+                    f"model.layers.{i}.block_sparse_moe.experts.{j}.w3.weight",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.block_sparse_moe.gate.weight",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.post_attention_layernorm.bias",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_proj.bias",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.bias",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.bias",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.bias",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "model.norm.bias",
+            "lm_head.weight",
+            "lm_head.bias"
+        ]
+
+        return weight_names
+
 class QWenConverter(BaseConverter):
     MODEL_TYPE = ModelType.QWen
     FILE_VERSION = 2
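Note: the two new converters extend the existing Phi-3 SU config block with plain little-endian fields. Phi3SU3Converter appends two float32 values (short_mscale, long_mscale), and Phi3MoESUConverter appends two int32 values (num_experts_per_tok, num_local_experts) on top of that. A minimal sketch of reading those fields back (the reader function and file positioning are illustrative only, not part of this commit):

```python
import struct

def read_phi35_extra_config(f):
    """Hypothetical reader for the fields appended by this commit.

    Assumes `f` is already positioned right after the base Phi3SU config
    block, whose layout is not shown in this diff.
    """
    short_mscale, long_mscale = struct.unpack('<2f', f.read(8))               # Phi3SU3Converter
    num_experts_per_tok, num_local_experts = struct.unpack('<2i', f.read(8))  # Phi3MoESUConverter
    return {
        'short_mscale': short_mscale,
        'long_mscale': long_mscale,
        'num_experts_per_tok': num_experts_per_tok,
        'num_local_experts': num_local_experts,
    }
```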
@@ -3677,9 +3763,14 @@ def main():
             config.rope_scaling['type'] = 'longrope'
 
         if config.rope_scaling['type'] == 'longrope':
-            Phi3SUConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+            if 'long_mscale' in config.rope_scaling:
+                Phi3SU3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
+            else:
+                Phi3SUConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
         else:
             raise Exception(config.rope_scaling['type'])
+    elif arch == 'PhiMoEForCausalLM':
+        Phi3MoESUConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'dolphinphi2':
         Phi2Converter.MODEL_TYPE = ModelType.DolphinPhi2_v2 if config.hidden_act is not None else ModelType.DolphinPhi2
         Phi2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
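Note: the dispatch added to main() boils down to the following selection rule. This is a simplified sketch (the helper function itself is illustrative; only the converter classes, the architecture name, and the config keys come from this commit):

```python
def pick_phi3_converter(arch: str, rope_scaling: dict):
    # Phi-3.5 MoE reports its own architecture name and gets its own converter.
    if arch == 'PhiMoEForCausalLM':
        return Phi3MoESUConverter
    # 'longrope' models that ship an explicit long_mscale take the new SU3 path;
    # plain longrope models keep using the existing SU converter.
    if rope_scaling.get('type') == 'longrope':
        return Phi3SU3Converter if 'long_mscale' in rope_scaling else Phi3SUConverter
    raise Exception(rope_scaling.get('type'))
```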

docs/models.md

Lines changed: 2 additions & 2 deletions
@@ -91,9 +91,9 @@
 
 * [x] [Dolphin Phi-2](https://huggingface.co/cognitivecomputations/dolphin-2_6-phi-2/tree/a084bb141f99f67e8ff56a654e29ddd53a0b4d7a) (`-a DolphinPhi2`) 🐬
 
-* [x] Phi-3 Mini: [Instruct-4k](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct), [Instruct-128k](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)
+* [x] Phi-3: [Mini-Instruct-4k](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct), [Mini-Instruct-128k](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct), [Medium-Instruct-4k](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct), [Medium-Instruct-128k](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)
 
-* [x] Phi-3 Medium: [Instruct-4k](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct), [Instruct-128k](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)
+* [x] Phi-3.5: [Mini-Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct), [MoE-Instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)
 
 * QWen (`QWenLMHeadModel`, `Qwen2ForCausalLM`, `Qwen2MoeForCausalLM`)
   * [x] v1: [Chat-7B](https://huggingface.co/Qwen/Qwen-7B-Chat), [Chat-14B](https://huggingface.co/Qwen/Qwen-14B-Chat), [QAnything-7B](https://huggingface.co/netease-youdao/Qwen-7B-QAnything)

models/phi.cpp

Lines changed: 164 additions & 30 deletions
@@ -386,6 +386,8 @@ namespace v3
         void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
         void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
         void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
+    public:
+        bool add_bos = true;
     };
 
     static ChatHistoryEncoder _chat_encoder;
@@ -415,6 +417,15 @@
         end_token_id = tp->PieceToId("<|end|>");
         nl_token_id = tp->PieceToId("\n");
 
+        if (-1 == system_token_id)
+        {
+            CHATLLM_CHECK(tp->GetPieceSize() == 32000) << " unsupported tokenizer";
+            system_token_id    = 32006;
+            user_token_id      = 32010;
+            assistant_token_id = 32001;
+            end_token_id       = 32007;
+        }
+
         pad_token_id = eos_token_id;
 
         terminate_ids.insert(end_token_id);
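Note: the fallback above covers checkpoints whose sentencepiece model contains only the 32000 base pieces, so the chat control tokens are not found by PieceToId and come back as -1. The hard-coded ids correspond to the usual Phi-3 added-token table; a sketch of that mapping, taken directly from the values in the diff (illustrative only):

```python
# Added-token ids assumed by the fallback when the sentencepiece model
# has exactly 32000 pieces and the control tokens resolve to -1.
PHI3_FALLBACK_TOKEN_IDS = {
    '<|assistant|>': 32001,
    '<|system|>': 32006,
    '<|end|>': 32007,
    '<|user|>': 32010,
}
```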
@@ -512,7 +523,9 @@
     {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
 
-        ids.push_back(tok->bos_token_id);
+        if (add_bos)
+            ids.push_back(tok->bos_token_id);
+
        if (tok->get_system_prompt().size() > 0)
            tok->encode(tok->get_system_prompt(), ids, tok->system_token_id, tok->end_token_id);
    }
@@ -577,6 +590,7 @@ namespace v3_su
            {
                auto &attention = get_typed_transformer<ModelClass>()->layers[i].attention;
                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 scaling_factor,
                                 scaling_factor,
                                 config.hidden_size / config.num_attention_heads / 2,
                                 config.short_factor,
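Note: attention.config now takes separate short and long attention-scaling factors. The existing SU path simply passes the same derived scaling_factor twice, while the v3_su3 and v3_moe paths below pass config.short_mscale / config.long_mscale read from the checkpoint. For models that do not ship explicit mscale values, the factor is commonly derived as in the Hugging Face Phi-3 longrope implementation; the sketch below shows that formula as an assumption about the upstream convention, not code from this repository:

```python
import math

def longrope_default_mscale(max_position_embeddings: int,
                            original_max_position_embeddings: int) -> float:
    """Sketch of the commonly used longrope attention-scaling factor."""
    scale = max_position_embeddings / original_max_position_embeddings
    if scale <= 1.0:
        return 1.0
    return math.sqrt(1.0 + math.log(scale) / math.log(original_max_position_embeddings))
```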
@@ -590,54 +604,174 @@ namespace v3_su2
 {
     typedef v3_su::Config Config;
 
-    class ChatHistoryEncoder : public BaseHistoryEncoder
+    class Tokenizer : public v3::Tokenizer
     {
     public:
-        void append_sys_prompt(std::vector<int> &ids) const override;
-        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
-        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
-        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
+        Tokenizer(const BaseConfig &config) : v3::Tokenizer(config, &v3::_chat_encoder)
+        {
+            append_nl_after_end_tok = true;
+            v3::_chat_encoder.add_bos = false;
+        }
     };
 
-    static ChatHistoryEncoder _chat_encoder;
+    typedef v3_su::ConditionalGeneration ConditionalGeneration;
+}
 
-    class Tokenizer : public v3::Tokenizer
+namespace v3_su3
+{
+    struct Config : public v3_su2::Config
+    {
+        float short_mscale;
+        float long_mscale;
+    };
+
+    typedef v3_su2::Tokenizer Tokenizer;
+
+    class ConditionalGeneration : public v3_su2::ConditionalGeneration
     {
     public:
-        Tokenizer(const BaseConfig &config) : v3::Tokenizer(config, &_chat_encoder)
+        ConditionalGeneration() = default;
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = ModelType::MODEL_TYPE_PHI3_SU3)
+            : v3_su2::ConditionalGeneration(config, runtime_config, type, config.num_key_value_heads, config.max_length)
         {
-            append_nl_after_end_tok = true;
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                auto &attention = get_typed_transformer<ModelClass>()->layers[i].attention;
+                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 config.short_mscale,
+                                 config.long_mscale,
+                                 config.hidden_size / config.num_attention_heads / 2,
+                                 config.short_factor,
+                                 config.long_factor);
+            }
         }
     };
+}
 
-    typedef v3_su::ConditionalGeneration ConditionalGeneration;
+namespace v3_moe
+{
+    struct Config : public v3_su3::Config
+    {
+        int num_experts_per_tok;
+        int num_local_experts;
+    };
 
-    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
+    typedef v3_su3::Tokenizer Tokenizer;
+
+    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class Phi3SparseMoE : public BaseSparseMLP
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
-        append_ai_opening(round_idx, ids);
-        tok->encode(ai, ids, -1, tok->end_token_id);
-    }
+    public:
+        Phi3SparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
+            : BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false)
+        {
+        }
+    };
 
-    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
+    class Phi3SUSelfAttentionBiased : public Phi3SUSelfAttention
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    public:
+        Phi3SUSelfAttentionBiased(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length)
+            : Phi3SUSelfAttention(ctx, hidden_size, num_attention_heads, num_kv_heads, max_length, true, true)
+        {}
+    };
 
-        if (tok->get_system_prompt().size() > 0)
-            tok->encode(tok->get_system_prompt(), ids, tok->system_token_id, tok->end_token_id);
-    }
+    template<int num_local_experts, int num_experts_per_tok> class Phi3MoEBlock : public LMBlock1<LayerNorm, Phi3SUSelfAttentionBiased, LayerNorm,
+                                                                                                  Phi3SparseMoE<num_local_experts, num_experts_per_tok>>
+    {
+    public:
+        Phi3MoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int max_length)
+            : LMBlock1<LayerNorm, Phi3SUSelfAttentionBiased, LayerNorm,
+                       Phi3SparseMoE<num_local_experts, num_experts_per_tok>>(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, max_length)
+        {}
+    };
 
-    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
+    template<int _NUM_EXPERTS, int _EXPERTS_PER_TOK, ModelType type> class _ConditionalGeneration : public BaseModelForConditionalGeneration
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    public:
+        typedef BaseModelForConditionalGeneration Base;
+        typedef Model<Config, Embedding, LayerNorm, Phi3MoEBlock<_NUM_EXPERTS, _EXPERTS_PER_TOK>, int, int, int, int, int> ModelClass;
+    public:
+        _ConditionalGeneration() = default;
 
-        tok->encode(user, ids, tok->user_token_id, tok->end_token_id);
-    }
+        _ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
+            : Base(type, config, runtime_config), config(config)
+        {
+            constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
+            const size_t num_tensors = 3 + 2 + config.num_hidden_layers * (11 + 3 + 5);
+            const size_t ctx_size = num_tensors * tensor_ovhd;
+            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+            w_ctx_.dtype = config.dtype;
 
-    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
-    {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+            CHATLLM_CHECK((_NUM_EXPERTS == config.num_local_experts) && (_EXPERTS_PER_TOK == config.num_experts_per_tok))
+                << "unsupported MoE param";
 
-        tok->encode("", ids, tok->assistant_token_id, -1);
-    }
+            Base::GRAPH_SIZE = 4096 * 2;
+
+            Base::transformer = new ModelClass(
+                &w_ctx_, config, true,
+                config.hidden_size, config.num_attention_heads,
+                config.intermediate_size, config.num_key_value_heads, config.max_length);
+
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                auto &attention = Base::get_typed_transformer<ModelClass>()->layers[i].attention;
+                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 config.short_mscale,
+                                 config.long_mscale,
+                                 config.hidden_size / config.num_attention_heads / 2,
+                                 config.short_factor,
+                                 config.long_factor);
+            }
+
+            CHATLLM_CHECK(w_ctx_.get_used_mem() == w_ctx_.get_mem_size()) << "corrupted model weights";
+        }
+
+        void load(ModelLoader &loader) override
+        {
+            auto transformer = get_typed_transformer<ModelClass>();
+            loader.read_tensor("model.embed_tokens.weight", transformer->word_embeddings.weight);
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                std::string layer_prefix = "model.layers." + std::to_string(Base::layer_ids[i]) + '.';
+
+                loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w2.weight", transformer->layers[i].mlp.experts_down.weight);
+                loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w1.weight", transformer->layers[i].mlp.experts_gate.weight);
+                loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w3.weight", transformer->layers[i].mlp.experts_up.weight);
+
+                loader.read_tensor(layer_prefix + "block_sparse_moe.gate.weight",
+                                   transformer->layers[i].mlp.gate.weight);
+
+                loader.read_tensor(layer_prefix + "input_layernorm.weight",
+                                   transformer->layers[i].input_layernorm.weight);
+                loader.read_tensor(layer_prefix + "input_layernorm.bias",
+                                   transformer->layers[i].input_layernorm.bias);
+
+                loader.read_tensor(layer_prefix + "post_attention_layernorm.weight",
+                                   transformer->layers[i].post_attention_layernorm.weight);
+                loader.read_tensor(layer_prefix + "post_attention_layernorm.bias",
+                                   transformer->layers[i].post_attention_layernorm.bias);
+
+                loader.read_tensor(layer_prefix + "self_attn.k_proj.weight", transformer->layers[i].attention.k_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.k_proj.bias", transformer->layers[i].attention.k_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.o_proj.weight", transformer->layers[i].attention.o_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.o_proj.bias", transformer->layers[i].attention.o_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.q_proj.weight", transformer->layers[i].attention.q_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.q_proj.bias", transformer->layers[i].attention.q_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.v_proj.weight", transformer->layers[i].attention.v_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.v_proj.bias", transformer->layers[i].attention.v_proj.bias);
+            }
+            loader.read_tensor("model.norm.weight", transformer->final_layernorm.weight);
+            loader.read_tensor("model.norm.bias", transformer->final_layernorm.bias);
+            loader.read_tensor("lm_head.weight", dynamic_cast<Linear *>(transformer->lm_head)->weight);
+            loader.read_tensor("lm_head.bias", dynamic_cast<Linear *>(transformer->lm_head)->bias);
+        }
+
+    public:
+        Config config;
+    };
+
+    const int NUM_EXPERTS = 16;
+    const int EXPERTS_PER_TOK = 2;
+
+    typedef _ConditionalGeneration<NUM_EXPERTS, EXPERTS_PER_TOK, MODEL_TYPE_PHI3_MOE> ConditionalGeneration;
 }
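Note: the MoE path instantiates Phi3SparseMoE with NUM_EXPERTS = 16 and EXPERTS_PER_TOK = 2, matching Phi-3.5 MoE's 16-expert, top-2 routing. BaseSparseMLP's internals are not part of this diff, so the following is only a sketch of what a top-k sparse MoE forward pass with gated-SiLU experts typically looks like; the names and shapes are illustrative, with w1/w3/w2 following the gate/up/down mapping used by the converter above:

```python
import numpy as np

def top2_moe_forward(x, gate_w, w1, w2, w3, k=2):
    """Illustrative top-k MoE forward pass for a single token.

    x: (hidden,), gate_w: (n_experts, hidden),
    w1/w3: (n_experts, inter, hidden), w2: (n_experts, hidden, inter).
    """
    silu = lambda v: v / (1.0 + np.exp(-v))
    logits = gate_w @ x                      # one router score per expert
    top = np.argsort(logits)[-k:]            # pick the k highest-scoring experts
    weights = np.exp(logits[top] - logits[top].max())
    weights /= weights.sum()                 # softmax over the selected experts
    out = np.zeros_like(x)
    for w, e in zip(weights, top):
        h = silu(w1[e] @ x) * (w3[e] @ x)    # w1 = gate proj, w3 = up proj
        out += w * (w2[e] @ h)               # w2 = down proj
    return out
```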
