
Commit 5289f4f

Add PLaMo2 models
1 parent a682474 commit 5289f4f

7 files changed: +4905 -4919 lines changed

include/llama.h

Lines changed: 167 additions & 2 deletions
@@ -47,10 +47,29 @@
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 2
 
+#ifdef __cplusplus
+#include <vector>
+#include <string>
+#include <array> // Added for std::array
+
+// These enums need to be defined before struct llama_hparams
+enum llama_swa_type {
+    LLAMA_SWA_TYPE_UNSPECIFIED = -1,
+    LLAMA_SWA_TYPE_NONE        = 0,
+    LLAMA_SWA_TYPE_STANDARD    = 1, // standard SWA (used by Gemma-2)
+    LLAMA_SWA_TYPE_CHUNKED     = 2, // chunked SWA (used by Llama 4)
+};
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
+#endif // __cplusplus // Closes the block for C++ specific includes and enums
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-
 //
 // C interface
 //
@@ -1372,6 +1391,152 @@ extern "C" {
 
 #ifdef __cplusplus
 }
-#endif
+
+// Internal llama_hparams
+// NOTE: must be C-compatible
+// TODO: remove this C-compatibility requirement
+#include <cstdint>
+#include <cstddef>
+// #include <vector> // already included above
+// #include <string> // already included above
+// #include <array>  // already included above
+
+// Max number of layers that can be stored in llama_hparams arrays
+#define LLAMA_MAX_LAYERS 256
+
+struct llama_hparams {
+    uint32_t n_vocab = 0;
+    uint32_t n_ctx_train = 0; // context size used during training
+    uint32_t n_embd = 0;
+    uint32_t n_layer = 0;
+    uint32_t n_rot = 0;
+    uint32_t n_ff_exp = 0;   // feed-forward length for experts
+    uint32_t n_ff_shexp = 0; // feed-forward length for shared experts
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;
+    uint32_t n_expert_shared = 0;
+    uint32_t n_embd_head_k = 0; // dimension of key heads
+    uint32_t n_embd_head_v = 0; // dimension of value heads
+    // uint32_t n_embd_k_gqa = 0; // dimension of key GQA   // REMOVED
+    // uint32_t n_embd_v_gqa = 0; // dimension of value GQA // REMOVED
+    uint32_t n_embd_features = 0;    // dimension of features for wavtokenizer
+    uint32_t n_layer_dense_lead = 0; // number of leading dense layers for MoE models
+    uint32_t n_moe_layer_step = 0;   // step between MoE layers
+    uint32_t n_lora_q = 0;
+    uint32_t n_lora_kv = 0;
+    uint32_t n_lora_decay = 0;
+    uint32_t n_lora_iclr = 0;
+    uint32_t n_lora_value_res_mix = 0;
+    uint32_t n_lora_gate = 0;
+    uint32_t n_rel_attn_bkts = 0;
+    uint32_t n_no_rope_layer_step = 0;
+    uint32_t n_token_types = 0;
+    uint32_t n_swa = 0;         // sliding window attention size
+    uint32_t n_swa_pattern = 0; // sliding window attention pattern
+    uint32_t wkv_head_size = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
+    uint32_t rescale_every_n_layers = 0;
+    uint32_t token_shift_count = 0;
+    uint32_t n_embd_head_k_mla = 0; // dimension of key heads for MLA
+    uint32_t n_embd_head_v_mla = 0; // dimension of value heads for MLA
+    uint32_t ssm_d_conv = 0;  // SSM conv dimension
+    uint32_t ssm_d_inner = 0; // SSM inner dimension
+    uint32_t ssm_d_state = 0; // SSM state dimension
+    uint32_t ssm_dt_rank = 0; // SSM time step rank
+    uint32_t moe_every_n_layers = 0; // MoE layer interval
+
+    float f_norm_eps = 0.0f;       // rmsnorm eps
+    float f_norm_rms_eps = 0.0f;   // rmsnorm eps
+    float f_norm_group_eps = 0.0f; // group norm eps
+    uint32_t n_norm_groups = 0;    // group norm groups
+    float f_clamp_kqv = 0.0f;      // clamp kqv
+    float f_max_alibi_bias = 0.0f; // max alibi bias
+    float f_logit_scale = 0.0f;    // logit scale
+    float f_attention_scale = 0.0f; // attention scale
+    float f_embedding_scale = 0.0f; // embedding scale
+    float f_residual_scale = 0.0f;  // residual scale
+    float f_attn_logit_softcapping = 0.0f;  // attention logit softcapping
+    float f_final_logit_softcapping = 0.0f; // final logit softcapping
+    float n_attn_temp_floor_scale = 0.0f;   // attention temperature floor scale
+    float f_attn_temp_scale = 0.0f;         // attention temperature scale
+    float expert_weights_scale = 0.0f;      // expert weights scale
+
+    bool use_par_res = false;         // parallel residual
+    bool causal_attn = true;          // causal attention
+    bool rope_finetuned = false;      // rope finetuned
+    bool swin_norm = false;           // swin norm
+    bool attn_soft_cap = false;       // attention soft capping
+    bool ssm_dt_b_c_rms = false;      // ssm dt_b_c_rms
+    bool use_kq_norm = true;          // use kq norm
+    bool expert_weights_norm = false; // expert weights norm
+    bool use_alibi = false;           // use ALiBi
+    bool vocab_only = false;          // only load vocabulary
+
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; // C++ enum, keyword 'enum' omitted
+    llama_expert_gating_func_type expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; // C++ enum, keyword 'enum' omitted
+
+    float rope_freq_base_train = 0.0f;
+    float rope_freq_scale_train = 0.0f;
+    float rope_freq_base_train_swa = 0.0f;
+    float rope_freq_scale_train_swa = 0.0f;
+    float rope_attn_factor = 0.0f;
+    float rope_yarn_log_mul = 0.0f;
+    uint32_t n_ctx_orig_yarn = 0;
+    llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
+
+    llama_token dec_start_token_id = -1; // decoder start token id
+
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> rope_sections;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
+    std::vector<std::string> layers_block_type_arr;
+
+    // posnet / convnext hparams
+    struct {
+        uint32_t n_embd = 0;
+        uint32_t n_layer = 0;
+    } posnet;
+
+    struct {
+        uint32_t n_embd = 0;
+        uint32_t n_layer = 0;
+    } convnext;
+
+    // helper functions
+    uint32_t n_head      (uint32_t il = 0) const { return n_head_arr   [il % n_head_arr.size()]; }
+    uint32_t n_head_kv   (uint32_t il = 0) const { return n_head_kv_arr[il % n_head_kv_arr.size()]; }
+    uint32_t n_ff        (uint32_t il = 0) const { return n_ff_arr     [il % n_ff_arr.size()]; }
+    uint32_t n_gqa       (uint32_t il = 0) const { return n_head(il)/n_head_kv(il); }
+    uint32_t n_embd_k_gqa(uint32_t il = 0) const { return n_embd_head_k * n_head_kv(il); } // dimension of K (w/ GQA)
+    uint32_t n_embd_v_gqa(uint32_t il = 0) const { return n_embd_head_v * n_head_kv(il); } // dimension of V (w/ GQA)
+    uint32_t n_embd_k_s() const; // dimension of recurrent state for K
+    uint32_t n_embd_v_s() const; // dimension of recurrent state for V
+
+    bool is_swa(uint32_t il) const {
+        return swa_layers[il % swa_layers.size()] != 0;
+    }
+
+    bool is_swa_any() const {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            if (is_swa(il)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    void set_swa_pattern(uint32_t pattern) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            swa_layers[il] = (il + 1) % pattern == 0 ? 0 : 1;
+        }
+    }
+};
+#endif // __cplusplus
 
 #endif // LLAMA_H
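
Not part of the commit: a standalone C++ sketch illustrating the sliding-window-attention (SWA) layer pattern produced by the new inline llama_hparams::set_swa_pattern()/is_swa() helpers above. The layer count and pattern value below are made-up illustrative numbers; with pattern = 4, every 4th layer (il + 1 divisible by 4) keeps full attention and the rest use SWA.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
    constexpr uint32_t n_layer = 8; // hypothetical model depth
    constexpr uint32_t pattern = 4; // hypothetical SWA pattern

    // same expression as llama_hparams::set_swa_pattern() in the hunk above
    std::array<uint32_t, n_layer> swa_layers{};
    for (uint32_t il = 0; il < n_layer; ++il) {
        swa_layers[il] = (il + 1) % pattern == 0 ? 0 : 1;
    }

    // mirrors is_swa(): non-zero entries mark sliding-window layers
    for (uint32_t il = 0; il < n_layer; ++il) {
        std::printf("layer %u: %s\n", il, swa_layers[il] ? "SWA" : "full attention");
    }
    return 0;
}

For pattern > 0 this appears to reproduce the layout of the set_swa_pattern() definition removed from src/llama-hparams.cpp later in this commit; only the old n_pattern == 0 special case has no counterpart in the inline version.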

src/llama-arch.cpp

Lines changed: 38 additions & 0 deletions
@@ -33,6 +33,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHI3,       "phi3"      },
     { LLM_ARCH_PHIMOE,     "phimoe"    },
     { LLM_ARCH_PLAMO,      "plamo"     },
+    { LLM_ARCH_PLAMO2,     "plamo2"    },
     { LLM_ARCH_CODESHELL,  "codeshell" },
     { LLM_ARCH_ORION,      "orion"     },
     { LLM_ARCH_INTERNLM2,  "internlm2" },
@@ -121,6 +122,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE,            "%s.embedding_scale"            },
     { LLM_KV_TOKEN_SHIFT_COUNT,          "%s.token_shift_count"          },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,  "%s.interleave_moe_layer_step"  },
+    { LLM_KV_LAYERS_BLOCK_TYPE,          "%s.layers_block_type"          },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,       "%s.attention.head_count"       },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,    "%s.attention.head_count_kv"    },
@@ -733,6 +735,42 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
+   {
+       LLM_ARCH_PLAMO2,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+           { LLM_TENSOR_OUTPUT,          "output" },
+
+           // Common layer norms
+           { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },      // pre_mixer_norm
+           { LLM_TENSOR_POST_ATTN_NORM,  "blk.%d.post_attn_norm" }, // post_mixer_norm
+           { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },       // pre_mlp_norm
+           { LLM_TENSOR_POST_MLP_NORM,   "blk.%d.post_mlp_norm" },  // post_mlp_norm
+
+           // FFN tensors (common to all layers)
+           { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+
+           // Attention-specific tensors
+           { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+           { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+           { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+
+           // Mamba-specific tensors (SSM)
+           { LLM_TENSOR_SSM_IN,          "blk.%d.ssm_in" },
+           { LLM_TENSOR_SSM_CONV1D,      "blk.%d.ssm_conv1d" },
+           { LLM_TENSOR_SSM_X,           "blk.%d.ssm_x" },
+           { LLM_TENSOR_SSM_DT,          "blk.%d.ssm_dt" },
+           { LLM_TENSOR_SSM_A,           "blk.%d.ssm_a" },
+           { LLM_TENSOR_SSM_D,           "blk.%d.ssm_d" },
+           { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
+       },
+   },
    {
        LLM_ARCH_CODESHELL,
        {
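
Not part of the diff: a small standalone C++ sketch of how the per-layer "blk.%d...." patterns registered for LLM_ARCH_PLAMO2 above expand into concrete tensor names, and how a "%s."-prefixed metadata key such as the new "%s.layers_block_type" expands with the architecture name. The expand() helper is hypothetical and for illustration only; the real lookups go through llama.cpp's internal name-formatting helpers.

#include <cstdio>
#include <string>

// hypothetical helper: expands a printf-style pattern with a layer index
static std::string expand(const char * fmt, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, il);
    return buf;
}

// hypothetical helper: expands a printf-style pattern with the arch name
static std::string expand(const char * fmt, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, arch);
    return buf;
}

int main() {
    // per-layer tensor names, e.g. layer 3 of a PLaMo2 GGUF file
    std::printf("%s\n", expand("blk.%d.ssm_in", 3).c_str()); // blk.3.ssm_in
    std::printf("%s\n", expand("blk.%d.attn_q", 3).c_str()); // blk.3.attn_q

    // per-architecture metadata key from the LLM_KV_LAYERS_BLOCK_TYPE entry
    std::printf("%s\n", expand("%s.layers_block_type", "plamo2").c_str()); // plamo2.layers_block_type
    return 0;
}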

src/llama-arch.h

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,7 @@ enum llm_arch {
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
+    LLM_ARCH_PLAMO2,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
@@ -125,6 +126,7 @@ enum llm_kv {
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+    LLM_KV_LAYERS_BLOCK_TYPE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

src/llama-hparams.cpp

Lines changed: 2 additions & 72 deletions
@@ -2,69 +2,7 @@
 
 #include "ggml.h"
 
-void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
-    }
-}
-
-bool llama_hparams::is_swa_any() const {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        if (swa_layers[il]) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-uint32_t llama_hparams::n_head(uint32_t il) const {
-    if (il < n_layer) {
-        return n_head_arr[il];
-    }
-
-    GGML_ABORT("fatal error");
-}
-
-uint32_t llama_hparams::n_head_kv(uint32_t il) const {
-    if (il < n_layer) {
-        return n_head_kv_arr[il];
-    }
-
-    GGML_ABORT("fatal error");
-}
-
-uint32_t llama_hparams::n_ff(uint32_t il) const {
-    if (il < n_layer) {
-        return n_ff_arr[il];
-    }
-
-    GGML_ABORT("fatal error");
-}
-
-uint32_t llama_hparams::n_gqa(uint32_t il) const {
-    const uint32_t n_head    = this->n_head(il);
-    const uint32_t n_head_kv = this->n_head_kv(il);
-
-    if (n_head_kv == 0) {
-        return 0;
-    }
-
-    return n_head/n_head_kv;
-}
-
-uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
-    const uint32_t n_head_kv = this->n_head_kv(il);
-
-    return n_embd_head_k * n_head_kv;
-}
-
-uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
-    const uint32_t n_head_kv = this->n_head_kv(il);
-
-    return n_embd_head_v * n_head_kv;
-}
-
+// Only define functions that are not already inline in the header
 uint32_t llama_hparams::n_embd_k_s() const {
     if (wkv_head_size != 0) {
         // for RWKV models
@@ -84,12 +22,4 @@ uint32_t llama_hparams::n_embd_v_s() const {
 
     // corresponds to Mamba's ssm_states size
     return ssm_d_state * ssm_d_inner;
-}
-
-bool llama_hparams::is_swa(uint32_t il) const {
-    if (il < n_layer) {
-        return swa_layers[il];
-    }
-
-    GGML_ABORT("fatal error");
-}
+}
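
Not part of the diff: a minimal sketch of the quantity computed by the surviving n_embd_v_s() accessor for a Mamba-style (SSM) layer. The hyperparameter values are made up for illustration; the real ones come from the model's GGUF metadata.

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical SSM hyperparameters, illustration only
    const uint32_t ssm_d_state = 64;
    const uint32_t ssm_d_inner = 4096;

    // mirrors the "ssm_d_state * ssm_d_inner" expression kept in
    // llama_hparams::n_embd_v_s(): the per-layer recurrent state width
    const uint32_t n_embd_v_s = ssm_d_state * ssm_d_inner;

    std::printf("recurrent V-state width per layer: %u\n", n_embd_v_s);
    return 0;
}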
