#include "../src/models.h"
#include "../src/models_priv.h"

#define MODEL_TYPE_SEED_OSS (MODEL_TYPE_SEED + 0)

namespace chatllm::seed::oss
{
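    // Hyper-parameters beyond BaseConfig: grouped-query attention
    // (num_key_value_heads), per-head dimension, and the RoPE base frequency.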
    struct Config : public BaseConfig
    {
        int num_key_value_heads;
        int head_dim;
        float rope_theta;
    };

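    // Builds prompts in the Seed-OSS chat format: each message is framed as
    // <bos>role\ncontent<eos> (see Tokenizer::encode below).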
    class ChatHistoryEncoder : public BaseHistoryEncoder
    {
    public:
        void append_sys_prompt(std::vector<int> &ids) const override;
        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
        void append_user_opening(int round_idx, std::vector<int> &ids) const override;
    };

    static ChatHistoryEncoder _chat_encoder;

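    // BPE tokenizer for Seed-OSS. Besides loading the vocabulary, it resolves
    // the ids of the special tool-call / think / budget-reflection markers and
    // carries the user-configurable thinking budget (-1 = no budget handling).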
    class Tokenizer : public BaseTokenizer
    {
    public:
        Tokenizer(const BaseConfig &config)
            : Tokenizer(config, &_chat_encoder)
        {}

        Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder,
                  BaseHistoryEncoder *qa_encoder = nullptr,
                  BaseHistoryEncoder *completion_encoder = nullptr)
            : BaseTokenizer::BaseTokenizer(config, encoder, qa_encoder, completion_encoder),
              thinking_budget(-1), budget_reflections(-1)
        {
            sys_prompt = "";
        }

        size_t load(tokenizer::DataReader *buffer, int n_vocab) override;

    public:
        void encode_role(std::vector<int> &ids, const std::string &role) const;
        void encode(std::vector<int> &ids, const std::string &role, const std::string &content) const;

    public:
        int toolcall_begin_token_id;
        int toolcall_end_token_id;
        int think_begin_token_id;
        int think_end_token_id;
        int budget_begin_token_id;
        int budget_end_token_id;
        int nl_token_id;
    public:
        int thinking_budget;
        int budget_reflections;
    };

    size_t Tokenizer::load(tokenizer::DataReader *buffer, int n_vocab)
    {
        tp = new tokenizer::BPEProcessor2(
            {
                // (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1}| ?[^\s\p{L}\p{N}\r\n]+|\s*[\r\n]+|\s+(?!\S)|\s+
                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])",
                "[^\r\n\\p{L}\\p{N}]?\\p{L}+",
                "\\p{N}{1}",
                " ?[^\\s\\p{L}\\p{N}\r\n]+",
                "\\s*[\r\n]+",
                "\\s+(?!\\S)",
                "\\s+",
            }
        );
        size_t size = tp->Load(buffer, n_vocab);

        toolcall_begin_token_id = tp->PieceToId("<seed:tool_call>");
        toolcall_end_token_id = tp->PieceToId("</seed:tool_call>");
        think_begin_token_id = tp->PieceToId("<seed:think>");
        think_end_token_id = tp->PieceToId("</seed:think>");
        budget_begin_token_id = tp->PieceToId("<seed:cot_budget_reflect>");
        budget_end_token_id = tp->PieceToId("</seed:cot_budget_reflect>");

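        // Cache the id of a bare newline; it separates the role header from
        // the message content in Tokenizer::encode().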
        std::vector<int> ids;
        tp->Encode("\n", &ids);
        nl_token_id = ids[0];

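        // Decode Seed's native think markers as the conventional
        // <think>/</think> tags expected by downstream consumers.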
        tp->OverrideTokenDecoding(think_begin_token_id, "<think>");
        tp->OverrideTokenDecoding(think_end_token_id, "</think>");

        return size;
    }

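    // Message framing: a complete message is <bos>role\ncontent<eos>;
    // encode_role() emits only the <bos>role\n prefix, used to open a turn
    // that the model is expected to complete.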
    void Tokenizer::encode_role(std::vector<int> &ids, const std::string &role) const
    {
        ids.push_back(bos_token_id);
        BaseTokenizer::encode(role, ids);
        ids.push_back(nl_token_id);
    }

    void Tokenizer::encode(std::vector<int> &ids, const std::string &role, const std::string &content) const
    {
        ids.push_back(bos_token_id);
        BaseTokenizer::encode(role, ids);
        ids.push_back(nl_token_id);
        BaseTokenizer::encode(content, ids);
        ids.push_back(eos_token_id);
    }

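    // Emits the user's system prompt (if any) and, when a thinking budget is
    // set, an extra system message instructing the model how to honour it.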
    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);

        auto s = tok->get_system_prompt();
        if (s.size() > 0)
        {
            tok->encode(ids, "system", s);
        }

        if (tok->thinking_budget == 0)
        {
            tok->encode(ids, "system", "You are an intelligent assistant that can answer questions in one step without the need for reasoning and thinking, that is, your thinking budget is 0. Next, please skip the thinking process and directly start answering the user's questions.");
        }
        else if (tok->thinking_budget > 0)
        {
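            // Maps thinking-budget thresholds to how often (every N tokens)
            // the model should emit a budget reflection.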
            const static std::vector<std::pair<int, int>> table =
            {
                {0, 0},
                {512, 128},
                {1024, 256},
                {2048, 512},
                {4096, 512},
                {8192, 1024},
                {16384, 1024},
            };
            for (const auto &t : table)
            {
                if (t.first >= tok->thinking_budget)
                {
                    tok->budget_reflections = t.second;
                    break;
                }
            }

            if (tok->budget_reflections < 0)
                tok->budget_reflections = table.back().second;

            std::ostringstream oss;
            oss << "You are an intelligent assistant with reflective ability. In the process of thinking and reasoning, you need to strictly follow the thinking budget, which is "
                << "\"" << tok->thinking_budget << "\"."
                << " That is, you need to complete your thinking within "
                << tok->thinking_budget
                << " tokens and start answering the user's questions. You will reflect on your thinking process every "
                << tok->budget_reflections
                << " tokens, stating how many tokens have been used and how many are left.";
            tok->encode(ids, "system", oss.str());
        }
    }

    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode(ids, "assistant", ai);
    }

    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode(ids, "user", user);
    }

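    // Opens an assistant turn. With a zero thinking budget, the think and
    // budget-reflection markers are pre-inserted so the model moves straight
    // past its reasoning phase.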
    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_role(ids, "assistant");

        if (tok->thinking_budget == 0)
        {
            ids.push_back(tok->think_begin_token_id);
            ids.push_back(tok->budget_begin_token_id);
        }
    }

    void ChatHistoryEncoder::append_user_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_role(ids, "user");
    }

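    // Seed-OSS is close enough to QWen2 that the QWen2 transformer block is
    // reused here rather than defining a new block type.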
    class ConditionalGeneration : public BaseModelForConditionalGeneration
    {
    public:
        typedef Model<Config, Embedding, RMSNorm, QWen2Block, int, int, int, int, int, int> ModelClass;
    public:
        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = (ModelType)MODEL_TYPE_SEED_OSS);

        void set_additional_args(const std::map<std::string, std::string> &args) override;
    public:
        Config config;
    };

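    // The weight context is metadata-only (no_alloc = true), sized as tensor
    // count times ggml_tensor_overhead(): 3 model-level tensors plus 15 per
    // transformer layer.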
    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type)
        : BaseModelForConditionalGeneration(type, config, runtime_config, 4096 * 2),
          config(config)
    {
        const size_t tensor_ovhd = ggml_tensor_overhead();
        const size_t num_tensors = 3 + config.num_hidden_layers * 15;
        const size_t ctx_size = num_tensors * tensor_ovhd;

        w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
        w_ctx_.dtype = config.dtype;

        transformer = new ModelClass(&w_ctx_, config, false,
                                     config.hidden_size, config.num_attention_heads,
                                     config.intermediate_size, config.num_key_value_heads,
                                     config.head_dim,
                                     config.max_length);

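        // Propagate the configured RoPE base frequency to every attention layer.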
        for (int i = 0; i < config.num_hidden_layers; i++)
        {
            auto &layer = get_typed_transformer<ModelClass>()->layers[i];
            layer.attention.freq_base = config.rope_theta;
        }

        w_ctx_.check_used_mem_size(true);
    }

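    // Run-time options; currently only `thinking_budget` is recognized
    // (0 disables thinking, a positive value caps it).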
    void ConditionalGeneration::set_additional_args(const std::map<std::string, std::string> &args)
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->thinking_budget = utils::get_opt(args, "thinking_budget", tok->thinking_budget);
    }

    REGISTER_MODEL_LOADER(SEED_OSS, seed::oss, 1);
}