+struct Config : public llama::v3::Config
+{
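+    // Fields added on top of llama::v3::Config; presumably filled from the
+    // model's config.json (head_dim plus the rope_scaling block used below).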
+    int head_dim;
+    int rope_scaling_original_max_position_embeddings;
+    float rope_scaling_beta_fast;
+    float rope_scaling_beta_slow;
+    float rope_scaling_factor;
+};
+
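+// Chat template assembled by the encoder below (BOS token first, then, per the
+// string literals it emits):
+//
+//   <|system|>
+//   {system prompt}
+//   <|end|>
+//   <|user|>
+//   {user message}
+//   <|end|>
+//   <|assistant|>
+//   {assistant reply}
+//   <|end|>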
+class ChatHistoryEncoder : public BaseHistoryEncoder
+{
+public:
+    void append_sys_prompt(std::vector<int> &ids) const override
+    {
+        std::ostringstream oss;
+        ids.push_back(tokenizer->bos_token_id);
+        oss << "<|system|>\n" << tokenizer->get_system_prompt() << "\n<|end|>\n";
+        tokenizer->encode(oss.str(), ids);
+    }
+    void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override
+    {
+        append_ai_opening(round_idx, ids);
+        tokenizer->encode(ai, ids);
+        tokenizer->encode("\n<|end|>\n", ids);
+    }
+
+    void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override
+    {
+        append_user_opening(round_idx, ids);
+        tokenizer->encode(user, ids);
+        tokenizer->encode("\n<|end|>\n", ids);
+    }
+
+    void append_ai_opening(int round_idx, std::vector<int> &ids) const override
+    {
+        tokenizer->encode("<|assistant|>\n", ids);
+    }
+
+    void append_user_opening(int round_idx, std::vector<int> &ids) const override
+    {
+        tokenizer->encode("<|user|>\n", ids);
+    }
+};
+
+static ChatHistoryEncoder _chat_encoder;
+
+class Tokenizer : public BaseTokenizer
+{
+public:
+    Tokenizer(const BaseConfig &config)
+        : Tokenizer(config, &_chat_encoder)
+    {}
+
+    Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder)
+        : BaseTokenizer::BaseTokenizer(config, encoder)
+    {
+        sys_prompt = "You are a helpful AI assistant that provides accurate and concise information.";
+    }
+
+    size_t load(tokenizer::DataReader *buffer, int n_vocab) override
+    {
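+        // Pre-tokenizer split pattern for the BPE vocabulary: case-aware word
+        // pieces, single digits, punctuation runs, and whitespace (assumed to
+        // match the regex shipped with this model's tokenizer config).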
+        tp = new tokenizer::BPEProcessor2(
+            {
+                "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+            }
+        );
+        size_t size = tp->Load(buffer, n_vocab);
+
+        return size;
+    }
+};
+
+class ConditionalGeneration : public llama::v2::GenericConditionalGeneration<LlamaBlock>
+{
+public:
+    ConditionalGeneration() = default;
+    ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = ModelType::MODEL_TYPE_APRIEL)
+        : llama::v2::GenericConditionalGeneration<LlamaBlock>(config, runtime_config, type, config.num_key_value_heads, config.head_dim, config.max_length, 12, false)
+    {
+        auto transformer = Base::get_typed_transformer<ModelClass2>();
+        for (int i = 0; i < config.num_hidden_layers; i++)
+        {
+            auto &attention = transformer->layers[i].attention;
+            attention.freq_base = config.rope_theta;
+
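+            // YaRN-style context extension: the pre-scaling context length and
+            // the fast/slow correction-ramp bounds (assuming the same semantics
+            // as llama.cpp's beta_fast/beta_slow).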
+            attention.n_original_ctx = config.rope_scaling_original_max_position_embeddings;
+            attention.beta_fast = config.rope_scaling_beta_fast;
+            attention.beta_slow = config.rope_scaling_beta_slow;
+
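+            // Positions are scaled by 1/factor; ext_factor = 1.0f turns the
+            // YaRN ramp on (assumption: llama.cpp-style convention where
+            // ext_factor == 0 disables it).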
+            attention.freq_scale = 1.0f / config.rope_scaling_factor;
+            attention.attn_factor = 1.0f;
+            attention.ext_factor = 1.0f;
+        }
+    }
+};