
Commit 352e7c8

add support of Seed-OSS

1 parent af05a8c

File tree: 7 files changed (+277, −1 lines)

CMakeLists.txt (1 addition & 0 deletions)

```diff
@@ -107,6 +107,7 @@ set(core_files src/backend.cpp
     models/phi.cpp
     models/qwen.cpp
     models/reka.cpp
+    models/seed.cpp
     models/smol.cpp
     models/solar.cpp
     models/stablelm.cpp
```

README.md (1 addition & 0 deletions)

```diff
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-08-22: Seed-OSS
 * 2025-08-11: GPT-OSS
 * 2025-08-05: Pangu-Embedded
 * 2025-07-29: Jiutian
```

convert.py (27 additions & 1 deletion)

```diff
@@ -211,6 +211,8 @@ class ModelType(Enum):
 
     GPTOSS = 0x2A00
 
+    SeedOSS = 0x2B00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -4338,7 +4340,7 @@ def get_weight_names(config):
         f"model.layers.{i}.self_attn.o_proj.weight",
         f"model.layers.{i}.input_layernorm.weight",
         f"model.layers.{i}.post_attention_layernorm.weight",
-        f"model.layers.{i}.mlp.down _roj.weight",
+        f"model.layers.{i}.mlp.down_proj.weight",
         f"model.layers.{i}.mlp.up_proj.weight",
         f"model.layers.{i}.mlp.gate_proj.weight",
     ]
@@ -4354,6 +4356,28 @@ def get_weight_names(config):
 
     return weight_names
 
+class SeedOSSConverter(BaseConverter):
+    MODEL_TYPE = ModelType.SeedOSS
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.attention_bias
+        assert not config.attention_out_bias
+        assert config.rope_scaling['rope_type'] == 'default'
+        assert not config.tie_word_embeddings
+        dump_llama_like_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_key_value_heads,
+            config.head_dim,
+        ]
+        f.write(struct.pack("i" * len(config_values), *config_values))
+        f.write(struct.pack("<f", config.rope_theta))
+
+    @staticmethod
+    def get_weight_names(config):
+        return QWen2Converter.get_weight_names(config)
+
 class QWen2AudioConverter(BaseConverter):
     MODEL_TYPE = ModelType.Qwen2Audio
 
@@ -8020,6 +8044,8 @@ def main():
         JiuTianConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'GptOssForCausalLM':
         GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'SeedOssForCausalLM':
+        SeedOSSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
```
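`dump_config` extends the llama-like header with two `int32` fields followed by one little-endian `float32`. As a quick sanity check on that layout, here is a minimal round-trip sketch; the numeric values are hypothetical (not taken from the real Seed-OSS config), and the preceding header written by `dump_llama_like_config` is not reproduced:

```python
import struct

# Pack the trailing Seed-OSS fields exactly as dump_config does:
# num_key_value_heads and head_dim as int32, then rope_theta as
# a little-endian float32. The values 8 / 128 / 1e7 are placeholders.
packed = struct.pack("i" * 2, 8, 128) + struct.pack("<f", 1e7)

num_key_value_heads, head_dim = struct.unpack("ii", packed[:8])
(rope_theta,) = struct.unpack("<f", packed[8:12])
print(num_key_value_heads, head_dim, rope_theta)  # -> 8 128 10000000.0
```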

docs/models.md (5 additions & 0 deletions)

```diff
@@ -258,6 +258,11 @@
     * [x] Confucius3-Math: [14B](https://huggingface.co/netease-youdao/Confucius3-Math/tree/62621490d5dccf5fea997be9df62dd8dc017f777) (`-a DeepSeek-R1-Distill-QWen`)
     * [x] Jan-Nano: [4B](https://huggingface.co/Menlo/Jan-nano/tree/5f4e450c127322db9477400890a0dd951c9f6ab7)
 
+* Seed (`SeedOssForCausalLM`)
+    * [x] OSS: [36B-Instruct](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct/tree/6f42c8b5bf8f3f687bd6fb28833da03a19867ce8)
+
+    Note: Use `--set thinking_budget N` to set `thinking_budget`. Default: -1.
+
 * SmolLM-3 (`SmolLM3ForCausalLM`)
     * [x] [3B](https://huggingface.co/HuggingFaceTB/SmolLM3-3B/tree/297fd6336cf21656d5f9d30a1db612ceeca67619)
```

models/seed.cpp (new file: 237 additions & 0 deletions)

```cpp
#include "../src/models.h"
#include "../src/models_priv.h"

#define MODEL_TYPE_SEED_OSS (MODEL_TYPE_SEED + 0)

namespace chatllm::seed::oss
{
    struct Config : public BaseConfig
    {
        int num_key_value_heads;
        int head_dim;
        float rope_theta;
    };

    class ChatHistoryEncoder : public BaseHistoryEncoder
    {
    public:
        void append_sys_prompt(std::vector<int> &ids) const override;
        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
        void append_user_opening(int round_idx, std::vector<int> &ids) const override;
    };

    static ChatHistoryEncoder _chat_encoder;

    class Tokenizer : public BaseTokenizer
    {
    public:
        Tokenizer(const BaseConfig &config)
            : Tokenizer(config, &_chat_encoder)
        {}

        Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder,
                  BaseHistoryEncoder *qa_encoder = nullptr,
                  BaseHistoryEncoder *completion_encoder = nullptr)
            : BaseTokenizer::BaseTokenizer(config, encoder, qa_encoder, completion_encoder),
              thinking_budget(-1), budget_reflections(-1)
        {
            sys_prompt = "";
        }

        size_t load(tokenizer::DataReader *buffer, int n_vocab) override;

    public:
        void encode_role(std::vector<int> &ids, const std::string &role) const;
        void encode(std::vector<int> &ids, const std::string &role, const std::string &content) const;

    public:
        int toolcall_begin_token_id;
        int toolcall_end_token_id;
        int think_begin_token_id;
        int think_end_token_id;
        int budget_begin_token_id;
        int budget_end_token_id;
        int nl_token_id;
    public:
        int thinking_budget;
        int budget_reflections;
    };

    size_t Tokenizer::load(tokenizer::DataReader *buffer, int n_vocab)
    {
        tp = new tokenizer::BPEProcessor2(
            {
                // (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])",
                "[^\r\n\\p{L}\\p{N}]?\\p{L}+",
                "\\p{N}{1}",
                " ?[^\\s\\p{L}\\p{N}\r\n]+",
                "\\s*[\r\n]+",
                "\\s+(?!\\S)",
                "\\s+",
            }
        );
        size_t size = tp->Load(buffer, n_vocab);

        toolcall_begin_token_id = tp->PieceToId("<seed:tool_call>");
        toolcall_end_token_id = tp->PieceToId("</seed:tool_call>");
        think_begin_token_id = tp->PieceToId("<seed:think>");
        think_end_token_id = tp->PieceToId("</seed:think>");
        budget_begin_token_id = tp->PieceToId("<seed:cot_budget_reflect>");
        budget_end_token_id = tp->PieceToId("</seed:cot_budget_reflect>");

        std::vector<int> ids;
        tp->Encode("\n", &ids);
        nl_token_id = ids[0];

        tp->OverrideTokenDecoding(think_begin_token_id, "<think>");
        tp->OverrideTokenDecoding(think_end_token_id, "</think>");

        return size;
    }
```
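The pattern list passed to `BPEProcessor2` is the GPT-style pre-tokenizer split spelled out in the comment: contractions, letter runs with an optional leading non-letter, single digits, punctuation clusters, newline runs, and whitespace. A minimal sketch of the same split in Python, using the third-party `regex` package for the `\p{L}`/`\p{N}` classes; this mirrors only the commented pattern, not the `BPEProcessor2` API:

```python
import regex  # pip install regex -- the stdlib re module lacks \p{L}/\p{N}

# the alternation from the comment in Tokenizer::load above
PATTERN = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"
    r"|[^\r\n\p{L}\p{N}]?\p{L}+"
    r"|\p{N}{1}"
    r"| ?[^\s\p{L}\p{N}\r\n]+"
    r"|\s*[\r\n]+"
    r"|\s+(?!\S)"
    r"|\s+"
)

print(regex.findall(PATTERN, "Seed-OSS can't count 123 tokens.\n"))
# ['Seed', '-OSS', ' can', "'t", ' count', ' ', '1', '2', '3', ' tokens', '.', '\n']
```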
```cpp
    void Tokenizer::encode_role(std::vector<int> &ids, const std::string &role) const
    {
        ids.push_back(bos_token_id);
        BaseTokenizer::encode(role, ids);
        ids.push_back(nl_token_id);
    }

    void Tokenizer::encode(std::vector<int> &ids, const std::string &role, const std::string &content) const
    {
        ids.push_back(bos_token_id);
        BaseTokenizer::encode(role, ids);
        ids.push_back(nl_token_id);
        BaseTokenizer::encode(content, ids);
        ids.push_back(eos_token_id);
    }
```
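So each message is framed as `bos + role + "\n" + content + eos`, with `encode_role` emitting only the opening frame when the model is about to generate. A tiny sketch of the resulting text layout; the special-token strings are an assumption (the C++ code only uses the bos/eos ids, and Seed-OSS ships `<seed:...>`-style specials, so plausible names are used):

```python
# Assumed token strings -- the real names come from the model's vocab.
BOS, EOS = "<seed:bos>", "<seed:eos>"

def render(role: str, content: str) -> str:
    # mirrors Tokenizer::encode: bos, role, newline, content, eos
    return f"{BOS}{role}\n{content}{EOS}"

def render_opening(role: str) -> str:
    # mirrors Tokenizer::encode_role: bos, role, newline (no content, no eos)
    return f"{BOS}{role}\n"

print(render("user", "Hello") + render_opening("assistant"))
```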
```cpp
    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);

        auto s = tok->get_system_prompt();
        if (s.size() > 0)
        {
            tok->encode(ids, "system", s);
        }

        if (tok->thinking_budget == 0)
        {
            tok->encode(ids, "system", "You are an intelligent assistant that can answer questions in one step without the need for reasoning and thinking, that is, your thinking budget is 0. Next, please skip the thinking process and directly start answering the user's questions.");
        }
        else if (tok->thinking_budget > 0)
        {
            // map the requested thinking budget to a reflection interval:
            // the first row whose threshold reaches the budget wins
            const static std::vector<std::pair<int, int>> table =
            {
                {0, 0},
                {512, 128},
                {1024, 256},
                {2048, 512},
                {4096, 512},
                {8192, 1024},
                {16384, 1024},
            };
            for (const auto &t : table)
            {
                if (t.first >= tok->thinking_budget)
                {
                    tok->budget_reflections = t.second;
                    break;
                }
            }

            if (tok->budget_reflections < 0)
                tok->budget_reflections = table.back().second;

            std::ostringstream oss;
            oss << "You are an intelligent assistant with reflective ability. In the process of thinking and reasoning, you need to strictly follow the thinking budget, which is "
                << "\"" << tok->thinking_budget << "\". "
                << "That is, you need to complete your thinking within "
                << tok->thinking_budget
                << " tokens and start answering the user's questions. You will reflect on your thinking process every "
                << tok->budget_reflections
                << " tokens, stating how many tokens have been used and how many are left.";
            tok->encode(ids, "system", oss.str());
        }
    }
```
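For a concrete reading of that table: the first row whose threshold is at least the requested budget supplies the reflection interval, and budgets beyond the last row fall back to its interval. A minimal sketch of the lookup:

```python
# (threshold, reflection_interval) pairs from append_sys_prompt above
TABLE = [(0, 0), (512, 128), (1024, 256), (2048, 512),
         (4096, 512), (8192, 1024), (16384, 1024)]

def budget_reflections(thinking_budget: int) -> int:
    for threshold, interval in TABLE:
        if threshold >= thinking_budget:
            return interval
    return TABLE[-1][1]  # budgets beyond 16384 reuse the last interval

assert budget_reflections(1024) == 256
assert budget_reflections(2048) == 512
assert budget_reflections(100000) == 1024
```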
```cpp
    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode(ids, "assistant", ai);
    }

    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode(ids, "user", user);
    }

    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_role(ids, "assistant");

        if (tok->thinking_budget == 0)
        {
            ids.push_back(tok->think_begin_token_id);
            ids.push_back(tok->budget_begin_token_id);
        }
    }

    void ChatHistoryEncoder::append_user_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_role(ids, "user");
    }

    class ConditionalGeneration : public BaseModelForConditionalGeneration
    {
    public:
        typedef Model<Config, Embedding, RMSNorm, QWen2Block, int, int, int, int, int, int> ModelClass;
    public:
        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = (ModelType)MODEL_TYPE_SEED_OSS);

        void set_additional_args(const std::map<std::string, std::string> &args) override;
    public:
        Config config;
    };

    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type)
        : BaseModelForConditionalGeneration(type, config, runtime_config, 4096 * 2),
          config(config)
    {
        const size_t tensor_ovhd = ggml_tensor_overhead();
        const size_t num_tensors = 3 + config.num_hidden_layers * 15;
        const size_t ctx_size = num_tensors * tensor_ovhd;

        w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
        w_ctx_.dtype = config.dtype;

        transformer = new ModelClass(&w_ctx_, config, false,
                                     config.hidden_size, config.num_attention_heads,
                                     config.intermediate_size, config.num_key_value_heads,
                                     config.head_dim,
                                     config.max_length);

        for (int i = 0; i < config.num_hidden_layers; i++)
        {
            auto &layer = get_typed_transformer<ModelClass>()->layers[i];
            layer.attention.freq_base = config.rope_theta;
        }

        w_ctx_.check_used_mem_size(true);
    }

    void ConditionalGeneration::set_additional_args(const std::map<std::string, std::string> &args)
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->thinking_budget = utils::get_opt(args, "thinking_budget", tok->thinking_budget);
    }

    REGISTER_MODEL_LOADER(SEED_OSS, seed::oss, 1);
}
```
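Note that `num_tensors = 3 + num_hidden_layers * 15` budgets only ggml tensor metadata (the context is created with `.no_alloc = true`, so no weight data is allocated there); `check_used_mem_size(true)` at the end of the constructor verifies the estimate once all weights have been declared.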

src/layers.h (4 additions & 0 deletions)

```diff
@@ -2755,6 +2755,10 @@ namespace chatllm
         QWen2Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int max_length)
             : LMBlock1(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, max_length)
         {}
+
+        QWen2Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int head_dim, int max_length)
+            : LMBlock1(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, head_dim, max_length)
+        {}
     };
 
     class BlueLMSelfAttention : public RoPESelfAttention<BaseAttention>
```
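The new overload exists because Seed-OSS declares `head_dim` explicitly in its config (and `SeedOSSConverter.dump_config` serializes it), rather than leaving it to be derived as `hidden_size / num_attention_heads`, so the reused `QWen2Block` must accept it as a separate parameter.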

src/models_priv.h (2 additions & 0 deletions)

```diff
@@ -171,6 +171,8 @@ namespace chatllm
         MODEL_TYPE_OPENAI = 0x2A00,
 
+        MODEL_TYPE_SEED = 0x2B00,
+
         MODEL_TYPE_BCE_Embedding = 0x10000100,
         MODEL_TYPE_BCE_ReRanker = 0x10000101,
         MODEL_TYPE_BGE_M3 = 0x10000102,
```
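This value must stay in sync with `ModelType.SeedOSS = 0x2B00` on the convert.py side: models/seed.cpp defines `MODEL_TYPE_SEED_OSS` as `MODEL_TYPE_SEED + 0` and registers it via `REGISTER_MODEL_LOADER(SEED_OSS, seed::oss, 1)`, so the model-type field of a converted file resolves to this loader.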
