
Commit 352e7c8

add support of Seed-OSS

1 parent af05a8c

File tree: 7 files changed (+277, −1 lines)

CMakeLists.txt (1 addition & 0 deletions)

```diff
@@ -107,6 +107,7 @@ set(core_files src/backend.cpp
     models/phi.cpp
     models/qwen.cpp
     models/reka.cpp
+    models/seed.cpp
     models/smol.cpp
     models/solar.cpp
     models/stablelm.cpp
```

README.md (1 addition & 0 deletions)

```diff
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-08-22: Seed-OSS
 * 2025-08-11: GPT-OSS
 * 2025-08-05: Pangu-Embedded
 * 2025-07-29: Jiutian
```

convert.py (27 additions & 1 deletion)

```diff
@@ -211,6 +211,8 @@ class ModelType(Enum):
 
     GPTOSS = 0x2A00
 
+    SeedOSS = 0x2B00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -4338,7 +4340,7 @@ def get_weight_names(config):
         f"model.layers.{i}.self_attn.o_proj.weight",
         f"model.layers.{i}.input_layernorm.weight",
         f"model.layers.{i}.post_attention_layernorm.weight",
-        f"model.layers.{i}.mlp.down _roj.weight",
+        f"model.layers.{i}.mlp.down_proj.weight",
         f"model.layers.{i}.mlp.up_proj.weight",
         f"model.layers.{i}.mlp.gate_proj.weight",
     ]
@@ -4354,6 +4356,28 @@ def get_weight_names(config):
 
     return weight_names
 
+class SeedOSSConverter(BaseConverter):
+    MODEL_TYPE = ModelType.SeedOSS
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.attention_bias
+        assert not config.attention_out_bias
+        assert config.rope_scaling['rope_type'] == 'default'
+        assert not config.tie_word_embeddings
+        dump_llama_like_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_key_value_heads,
+            config.head_dim,
+        ]
+        f.write(struct.pack("i" * len(config_values), *config_values))
+        f.write(struct.pack("<f", config.rope_theta))
+
+    @staticmethod
+    def get_weight_names(config):
+        return QWen2Converter.get_weight_names(config)
+
 class QWen2AudioConverter(BaseConverter):
     MODEL_TYPE = ModelType.Qwen2Audio
 
@@ -8020,6 +8044,8 @@ def main():
         JiuTianConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'GptOssForCausalLM':
         GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'SeedOssForCausalLM':
+        SeedOSSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
```
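`dump_config` extends the llama-like header with two `int32` fields followed by one little-endian `float32`. As a quick sanity check on that layout, here is a minimal round-trip sketch; the numeric values are hypothetical (not taken from the real Seed-OSS config), and the preceding header written by `dump_llama_like_config` is not reproduced:

```python
import struct

# Pack the trailing Seed-OSS fields exactly as dump_config does:
# num_key_value_heads and head_dim as int32, then rope_theta as
# a little-endian float32. The values 8 / 128 / 1e7 are placeholders.
packed = struct.pack("i" * 2, 8, 128) + struct.pack("<f", 1e7)

num_key_value_heads, head_dim = struct.unpack("ii", packed[:8])
(rope_theta,) = struct.unpack("<f", packed[8:12])
print(num_key_value_heads, head_dim, rope_theta)  # -> 8 128 10000000.0
```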

docs/models.md (5 additions & 0 deletions)

```diff
@@ -258,6 +258,11 @@
     * [x] Confucius3-Math: [14B](https://huggingface.co/netease-youdao/Confucius3-Math/tree/62621490d5dccf5fea997be9df62dd8dc017f777) (`-a DeepSeek-R1-Distill-QWen`)
     * [x] Jan-Nano: [4B](https://huggingface.co/Menlo/Jan-nano/tree/5f4e450c127322db9477400890a0dd951c9f6ab7)
 
+* Seed (`SeedOssForCausalLM`)
+    * [x] OSS: [36B-Instruct](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct/tree/6f42c8b5bf8f3f687bd6fb28833da03a19867ce8)
+
+    Note: Use `--set thinking_budget N` to set `thinking_budget`. Default: -1.
+
 * SmolLM-3 (`SmolLM3ForCausalLM`)
     * [x] [3B](https://huggingface.co/HuggingFaceTB/SmolLM3-3B/tree/297fd6336cf21656d5f9d30a1db612ceeca67619)
```

models/seed.cpp (new file: 237 additions & 0 deletions)

```cpp
#include "../src/models.h"
#include "../src/models_priv.h"

#define MODEL_TYPE_SEED_OSS (MODEL_TYPE_SEED + 0)

namespace chatllm::seed::oss
{
    struct Config : public BaseConfig
    {
        int num_key_value_heads;
        int head_dim;
        float rope_theta;
    };

    class ChatHistoryEncoder : public BaseHistoryEncoder
    {
    public:
        void append_sys_prompt(std::vector<int> &ids) const override;
        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
        void append_user_opening(int round_idx, std::vector<int> &ids) const override;
    };

    static ChatHistoryEncoder _chat_encoder;

    class Tokenizer : public BaseTokenizer
    {
    public:
        Tokenizer(const BaseConfig &config)
            : Tokenizer(config, &_chat_encoder)
        {}

        Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder,
                  BaseHistoryEncoder *qa_encoder = nullptr,
                  BaseHistoryEncoder *completion_encoder = nullptr)
            : BaseTokenizer::BaseTokenizer(config, encoder, qa_encoder, completion_encoder),
              thinking_budget(-1), budget_reflections(-1)
        {
            sys_prompt = "";
        }

        size_t load(tokenizer::DataReader *buffer, int n_vocab) override;

    public:
        void encode_role(std::vector<int> &ids, const std::string &role) const;
        void encode(std::vector<int> &ids, const std::string &role, const std::string &content) const;

    public:
        int toolcall_begin_token_id;
        int toolcall_end_token_id;
        int think_begin_token_id;
        int think_end_token_id;
        int budget_begin_token_id;
        int budget_end_token_id;
        int nl_token_id;
    public:
        int thinking_budget;
        int budget_reflections;
    };

    size_t Tokenizer::load(tokenizer::DataReader *buffer, int n_vocab)
    {
        tp = new tokenizer::BPEProcessor2(
            {
                // (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])",
                "[^\r\n\\p{L}\\p{N}]?\\p{L}+",
                "\\p{N}{1}",
                " ?[^\\s\\p{L}\\p{N}\r\n]+",
                "\\s*[\r\n]+",
                "\\s+(?!\\S)",
                "\\s+",
            }
        );
        size_t size = tp->Load(buffer, n_vocab);

        toolcall_begin_token_id = tp->PieceToId("<seed:tool_call>");
        toolcall_end_token_id = tp->PieceToId("</seed:tool_call>");
        think_begin_token_id = tp->PieceToId("<seed:think>");
        think_end_token_id = tp->PieceToId("</seed:think>");
        budget_begin_token_id = tp->PieceToId("<seed:cot_budget_reflect>");
        budget_end_token_id = tp->PieceToId("</seed:cot_budget_reflect>");

        std::vector<int> ids;
        tp->Encode("\n", &ids);
        nl_token_id = ids[0];

        tp->OverrideTokenDecoding(think_begin_token_id, "<think>");
        tp->OverrideTokenDecoding(think_end_token_id, "</think>");

        return size;
    }
```
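The pattern list passed to `BPEProcessor2` is the GPT-style pre-tokenizer split spelled out in the comment: contractions, letter runs with an optional leading non-letter, single digits, punctuation clusters, newline runs, and whitespace. A minimal sketch of the same split in Python, using the third-party `regex` package for the `\p{L}`/`\p{N}` classes; this mirrors only the commented pattern, not the `BPEProcessor2` API:

```python
import regex  # pip install regex -- the stdlib re module lacks \p{L}/\p{N}

# the alternation from the comment in Tokenizer::load above
PATTERN = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"
    r"|[^\r\n\p{L}\p{N}]?\p{L}+"
    r"|\p{N}{1}"
    r"| ?[^\s\p{L}\p{N}\r\n]+"
    r"|\s*[\r\n]+"
    r"|\s+(?!\S)"
    r"|\s+"
)

print(regex.findall(PATTERN, "Seed-OSS can't count 123 tokens.\n"))
# ['Seed', '-OSS', ' can', "'t", ' count', ' ', '1', '2', '3', ' tokens', '.', '\n']
```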
```cpp
    void Tokenizer::encode_role(std::vector<int> &ids, const std::string &role) const
    {
        ids.push_back(bos_token_id);
        BaseTokenizer::encode(role, ids);
        ids.push_back(nl_token_id);
    }

    void Tokenizer::encode(std::vector<int> &ids, const std::string &role, const std::string &content) const
    {
        ids.push_back(bos_token_id);
        BaseTokenizer::encode(role, ids);
        ids.push_back(nl_token_id);
        BaseTokenizer::encode(content, ids);
        ids.push_back(eos_token_id);
    }
```
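So each message is framed as `bos + role + "\n" + content + eos`, with `encode_role` emitting only the opening frame when the model is about to generate. A tiny sketch of the resulting text layout; the special-token strings are an assumption (the C++ code only uses the bos/eos ids, and Seed-OSS ships `<seed:...>`-style specials, so plausible names are used):

```python
# Assumed token strings -- the real names come from the model's vocab.
BOS, EOS = "<seed:bos>", "<seed:eos>"

def render(role: str, content: str) -> str:
    # mirrors Tokenizer::encode: bos, role, newline, content, eos
    return f"{BOS}{role}\n{content}{EOS}"

def render_opening(role: str) -> str:
    # mirrors Tokenizer::encode_role: bos, role, newline (no content, no eos)
    return f"{BOS}{role}\n"

print(render("user", "Hello") + render_opening("assistant"))
```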
```cpp
    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);

        auto s = tok->get_system_prompt();
        if (s.size() > 0)
        {
            tok->encode(ids, "system", s);
        }

        if (tok->thinking_budget == 0)
        {
            tok->encode(ids, "system", "You are an intelligent assistant that can answer questions in one step without the need for reasoning and thinking, that is, your thinking budget is 0. Next, please skip the thinking process and directly start answering the user's questions.");
        }
        else if (tok->thinking_budget > 0)
        {
            // map the requested thinking budget to a reflection interval:
            // the first row whose threshold reaches the budget wins
            const static std::vector<std::pair<int, int>> table =
            {
                {0, 0},
                {512, 128},
                {1024, 256},
                {2048, 512},
                {4096, 512},
                {8192, 1024},
                {16384, 1024},
            };
            for (const auto &t : table)
            {
                if (t.first >= tok->thinking_budget)
                {
                    tok->budget_reflections = t.second;
                    break;
                }
            }

            if (tok->budget_reflections < 0)
                tok->budget_reflections = table.back().second;

            std::ostringstream oss;
            oss << "You are an intelligent assistant with reflective ability. In the process of thinking and reasoning, you need to strictly follow the thinking budget, which is "
                << "\"" << tok->thinking_budget << "\". "
                << "That is, you need to complete your thinking within "
                << tok->thinking_budget
                << " tokens and start answering the user's questions. You will reflect on your thinking process every "
                << tok->budget_reflections
                << " tokens, stating how many tokens have been used and how many are left.";
            tok->encode(ids, "system", oss.str());
        }
    }
```
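For a concrete reading of that table: the first row whose threshold is at least the requested budget supplies the reflection interval, and budgets beyond the last row fall back to its interval. A minimal sketch of the lookup:

```python
# (threshold, reflection_interval) pairs from append_sys_prompt above
TABLE = [(0, 0), (512, 128), (1024, 256), (2048, 512),
         (4096, 512), (8192, 1024), (16384, 1024)]

def budget_reflections(thinking_budget: int) -> int:
    for threshold, interval in TABLE:
        if threshold >= thinking_budget:
            return interval
    return TABLE[-1][1]  # budgets beyond 16384 reuse the last interval

assert budget_reflections(1024) == 256
assert budget_reflections(2048) == 512
assert budget_reflections(100000) == 1024
```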
```cpp
    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode(ids, "assistant", ai);
    }

    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode(ids, "user", user);
    }

    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_role(ids, "assistant");

        if (tok->thinking_budget == 0)
        {
            ids.push_back(tok->think_begin_token_id);
            ids.push_back(tok->budget_begin_token_id);
        }
    }

    void ChatHistoryEncoder::append_user_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_role(ids, "user");
    }

    class ConditionalGeneration : public BaseModelForConditionalGeneration
    {
    public:
        typedef Model<Config, Embedding, RMSNorm, QWen2Block, int, int, int, int, int, int> ModelClass;
    public:
        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = (ModelType)MODEL_TYPE_SEED_OSS);

        void set_additional_args(const std::map<std::string, std::string> &args) override;
    public:
        Config config;
    };

    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type)
        : BaseModelForConditionalGeneration(type, config, runtime_config, 4096 * 2),
          config(config)
    {
        const size_t tensor_ovhd = ggml_tensor_overhead();
        const size_t num_tensors = 3 + config.num_hidden_layers * 15;
        const size_t ctx_size = num_tensors * tensor_ovhd;

        w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
        w_ctx_.dtype = config.dtype;

        transformer = new ModelClass(&w_ctx_, config, false,
                                     config.hidden_size, config.num_attention_heads,
                                     config.intermediate_size, config.num_key_value_heads,
                                     config.head_dim,
                                     config.max_length);

        for (int i = 0; i < config.num_hidden_layers; i++)
        {
            auto &layer = get_typed_transformer<ModelClass>()->layers[i];
            layer.attention.freq_base = config.rope_theta;
        }

        w_ctx_.check_used_mem_size(true);
    }

    void ConditionalGeneration::set_additional_args(const std::map<std::string, std::string> &args)
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->thinking_budget = utils::get_opt(args, "thinking_budget", tok->thinking_budget);
    }

    REGISTER_MODEL_LOADER(SEED_OSS, seed::oss, 1);
}
```
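Note that `num_tensors = 3 + num_hidden_layers * 15` budgets only ggml tensor metadata (the context is created with `.no_alloc = true`, so no weight data is allocated there); `check_used_mem_size(true)` at the end of the constructor verifies the estimate once all weights have been declared.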

src/layers.h (4 additions & 0 deletions)

```diff
@@ -2755,6 +2755,10 @@ namespace chatllm
         QWen2Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int max_length)
             : LMBlock1(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, max_length)
         {}
+
+        QWen2Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int head_dim, int max_length)
+            : LMBlock1(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, head_dim, max_length)
+        {}
     };
 
     class BlueLMSelfAttention : public RoPESelfAttention<BaseAttention>
```
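The new overload exists because Seed-OSS declares `head_dim` explicitly in its config (and `SeedOSSConverter.dump_config` serializes it), rather than leaving it to be derived as `hidden_size / num_attention_heads`, so the reused `QWen2Block` must accept it as a separate parameter.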

src/models_priv.h (2 additions & 0 deletions)

```diff
@@ -171,6 +171,8 @@ namespace chatllm
         MODEL_TYPE_OPENAI = 0x2A00,
 
+        MODEL_TYPE_SEED = 0x2B00,
+
         MODEL_TYPE_BCE_Embedding = 0x10000100,
         MODEL_TYPE_BCE_ReRanker = 0x10000101,
         MODEL_TYPE_BGE_M3 = 0x10000102,
```
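This value must stay in sync with `ModelType.SeedOSS = 0x2B00` on the convert.py side: models/seed.cpp defines `MODEL_TYPE_SEED_OSS` as `MODEL_TYPE_SEED + 0` and registers it via `REGISTER_MODEL_LOADER(SEED_OSS, seed::oss, 1)`, so the model-type field of a converted file resolves to this loader.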
