1+ namespace v2
2+ {
3+ struct Config : public glm ::v2::Config
4+ {
5+ };
6+
7+ class ChatHistoryEncoder : public BaseHistoryEncoder
8+ {
9+ public:
10+ void do_append_user (int round_idx, const std::string &user, std::vector<int > &ids) const override ;
11+ };
12+
13+ static ChatHistoryEncoder _chat_encoder;
14+
15+ class Tokenizer : public glm ::v2::Tokenizer
16+ {
17+ public:
18+ Tokenizer (const Config &config) : glm::v2::Tokenizer::Tokenizer(config, &_chat_encoder)
19+ {
20+ sys_prompt = " # language: Python" ;
21+ }
22+ };
23+
24+ class ConditionalGeneration : public glm ::v2::ConditionalGeneration
25+ {
26+ public:
27+ ConditionalGeneration () = default ;
28+ ConditionalGeneration (const Config &config)
29+ : glm::v2::ConditionalGeneration(config, MODEL_TYPE_CODEGEEX2)
30+ {
31+ }
32+ };
33+
34+ void ChatHistoryEncoder::do_append_user (int round_idx, const std::string &user, std::vector<int > &ids) const
35+ {
36+ std::string combined = tokenizer->get_system_prompt () + " \n " + user + " \n " ;
37+ tokenizer->encode (combined, ids);
38+ }
39+ }
40+
41+ namespace v4
42+ {
43+ typedef glm::v4::Config Config;
44+
45+ class Tokenizer : public glm ::v4::Tokenizer
46+ {
47+ public:
48+ Tokenizer (const Config &config) : glm::v4::Tokenizer(config)
49+ {}
50+
51+ size_t load (tokenizer::DataReader *buffer, int n_vocab) override
52+ {
53+ size_t r = glm::v4::Tokenizer::load (buffer, n_vocab);
54+ int special_id = observation_token_id + 5 ;
55+ code_prefix_token_id = special_id++;
56+ code_middle_token_id = special_id++;
57+ code_suffix_token_id = special_id++;
58+ cursor_token_id = special_id++;
59+ tp->AddAddedToken (" <|code_prefix|>" , code_prefix_token_id);
60+ tp->AddAddedToken (" <|code_middle|>" , code_middle_token_id);
61+ tp->AddAddedToken (" <|code_suffix|>" , code_suffix_token_id);
62+ tp->AddAddedToken (" <|cursor|>" , cursor_token_id);
63+ return r;
64+ }
65+ public:
66+ int code_prefix_token_id;
67+ int code_middle_token_id;
68+ int code_suffix_token_id;
69+ int cursor_token_id;
70+ };
71+
72+ class ConditionalGeneration : public glm ::v4::ConditionalGeneration
73+ {
74+ public:
75+ ConditionalGeneration (const Config &config)
76+ : glm::v4::ConditionalGeneration(config, MODEL_TYPE_CODEGEEX4)
77+ {
78+ }
79+
80+ // FIXME: this mode seems not support tool calling actually
81+ // https://github.com/THUDM/CodeGeeX4/issues/8
82+ ChunkInterceptor *get_interceptor (void ) override { return nullptr ; }
83+ };
84+ }
0 commit comments