|
47 | 47 | #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ |
48 | 48 | #define LLAMA_STATE_SEQ_VERSION 2 |
49 | 49 |
|
| 50 | +#ifdef __cplusplus |
| 51 | +#include <vector> |
| 52 | +#include <string> |
| 53 | +#include <array> // for std::array
| 54 | + |
| 55 | +// These enums need to be defined before struct llama_hparams |
| 56 | +enum llama_swa_type { |
| 57 | + LLAMA_SWA_TYPE_UNSPECIFIED = -1, |
| 58 | + LLAMA_SWA_TYPE_NONE = 0, |
| 59 | + LLAMA_SWA_TYPE_STANDARD = 1, // standard SWA (used by Gemma-2) |
| 60 | + LLAMA_SWA_TYPE_CHUNKED = 2, // chunked SWA (used by Llama 4) |
| 61 | +}; |
| 62 | + |
| 63 | +enum llama_expert_gating_func_type { |
| 64 | + LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0, |
| 65 | + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1, |
| 66 | + LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2, |
| 67 | +}; |
| 68 | +#endif // __cplusplus (C++-specific includes and enums)
| 69 | + |
50 | 70 | #ifdef __cplusplus |
51 | 71 | extern "C" { |
52 | 72 | #endif |
53 | | - |
54 | 73 | // |
55 | 74 | // C interface |
56 | 75 | // |
@@ -1372,6 +1391,152 @@ extern "C" { |
1372 | 1391 |
|
1373 | 1392 | #ifdef __cplusplus |
1374 | 1393 | } |
1375 | | -#endif |
| 1394 | + |
| 1395 | +// Internal llama_hparams |
| 1396 | +// NOTE: must be C-compatible |
| 1397 | +// TODO: remove this C-compatibility requirement |
| 1398 | +#include <cstdint> |
| 1399 | +#include <cstddef> |
| 1400 | +// #include <vector> // already included above |
| 1401 | +// #include <string> // already included above |
| 1402 | +// #include <array> // already included above |
| 1403 | + |
| 1404 | +// Max number of layers that can be stored in llama_hparams arrays |
| 1405 | +#define LLAMA_MAX_LAYERS 256 |
| 1406 | + |
| 1407 | +struct llama_hparams { |
| 1408 | + uint32_t n_vocab = 0; |
| 1409 | + uint32_t n_ctx_train = 0; // context size used during training |
| 1410 | + uint32_t n_embd = 0; |
| 1411 | + uint32_t n_layer = 0; |
| 1412 | + uint32_t n_rot = 0; |
| 1413 | + uint32_t n_ff_exp = 0; // feed-forward length for experts |
| 1414 | + uint32_t n_ff_shexp = 0; // feed-forward length for shared experts |
| 1415 | + uint32_t n_expert = 0; |
| 1416 | + uint32_t n_expert_used = 0; |
| 1417 | + uint32_t n_expert_shared = 0; |
| 1418 | + uint32_t n_embd_head_k = 0; // dimension of key heads |
| 1419 | + uint32_t n_embd_head_v = 0; // dimension of value heads |
| 1421 | + // uint32_t n_embd_k_gqa = 0; // removed: use the per-layer n_embd_k_gqa() helper below
| 1422 | + // uint32_t n_embd_v_gqa = 0; // removed: use the per-layer n_embd_v_gqa() helper below
| 1422 | + uint32_t n_embd_features = 0; // dimension of features for wavtokenizer |
| 1423 | + uint32_t n_layer_dense_lead = 0; // number of leading dense layers for MoE models |
| 1424 | + uint32_t n_moe_layer_step = 0; // step between MoE layers |
| 1425 | + uint32_t n_lora_q = 0; |
| 1426 | + uint32_t n_lora_kv = 0; |
| 1427 | + uint32_t n_lora_decay = 0; |
| 1428 | + uint32_t n_lora_iclr = 0; |
| 1429 | + uint32_t n_lora_value_res_mix = 0; |
| 1430 | + uint32_t n_lora_gate = 0; |
| 1431 | + uint32_t n_rel_attn_bkts = 0; |
| 1432 | + uint32_t n_no_rope_layer_step = 0; |
| 1433 | + uint32_t n_token_types = 0; |
| 1434 | + uint32_t n_swa = 0; // sliding window attention size |
| 1435 | + uint32_t n_swa_pattern = 0; // sliding window attention pattern |
| 1436 | + uint32_t wkv_head_size = 0; |
| 1437 | + uint32_t time_mix_extra_dim = 0; |
| 1438 | + uint32_t time_decay_extra_dim = 0; |
| 1439 | + uint32_t rescale_every_n_layers = 0; |
| 1440 | + uint32_t token_shift_count = 0; |
| 1441 | + uint32_t n_embd_head_k_mla = 0; // dimension of key heads for MLA |
| 1442 | + uint32_t n_embd_head_v_mla = 0; // dimension of value heads for MLA |
| 1443 | + uint32_t ssm_d_conv = 0; // SSM conv dimension |
| 1444 | + uint32_t ssm_d_inner = 0; // SSM inner dimension |
| 1445 | + uint32_t ssm_d_state = 0; // SSM state dimension |
| 1446 | + uint32_t ssm_dt_rank = 0; // SSM time step rank |
| 1447 | + uint32_t moe_every_n_layers = 0; // MoE layer interval |
| 1448 | + |
| 1449 | + float f_norm_eps = 0.0f; // layer norm eps
| 1450 | + float f_norm_rms_eps = 0.0f; // rmsnorm eps |
| 1451 | + float f_norm_group_eps = 0.0f; // group norm eps |
| 1452 | + uint32_t n_norm_groups = 0; // group norm groups |
| 1453 | + float f_clamp_kqv = 0.0f; // clamp kqv |
| 1454 | + float f_max_alibi_bias = 0.0f; // max alibi bias |
| 1455 | + float f_logit_scale = 0.0f; // logit scale |
| 1456 | + float f_attention_scale = 0.0f; // attention scale |
| 1457 | + float f_embedding_scale = 0.0f; // embedding scale |
| 1458 | + float f_residual_scale = 0.0f; // residual scale |
| 1459 | + float f_attn_logit_softcapping = 0.0f; // attention logit softcapping |
| 1460 | + float f_final_logit_softcapping = 0.0f; // final logit softcapping |
| 1461 | + float n_attn_temp_floor_scale = 0.0f; // attention temperature floor scale |
| 1462 | + float f_attn_temp_scale = 0.0f; // attention temperature scale |
| 1463 | + float expert_weights_scale = 0.0f; // expert weights scale |
| 1464 | + |
| 1465 | + bool use_par_res = false; // parallel residual |
| 1466 | + bool causal_attn = true; // causal attention |
| 1467 | + bool rope_finetuned = false; // rope finetuned |
| 1468 | + bool swin_norm = false; // swin norm |
| 1469 | + bool attn_soft_cap = false; // attention soft capping |
| 1470 | + bool ssm_dt_b_c_rms = false; // ssm dt_b_c_rms |
| 1471 | + bool use_kq_norm = true; // use kq norm |
| 1472 | + bool expert_weights_norm = false; // expert weights norm |
| 1473 | + bool use_alibi = false; // use ALiBi |
| 1474 | + bool vocab_only = false; // only load vocabulary |
| 1475 | + |
| 1476 | + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; |
| 1477 | + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; |
| 1478 | + llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; // C++ enum, keyword 'enum' omitted |
| 1479 | + llama_expert_gating_func_type expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; // C++ enum, keyword 'enum' omitted |
| 1480 | + |
| 1481 | + float rope_freq_base_train = 0.0f; |
| 1482 | + float rope_freq_scale_train = 0.0f; |
| 1483 | + float rope_freq_base_train_swa = 0.0f; |
| 1484 | + float rope_freq_scale_train_swa = 0.0f; |
| 1485 | + float rope_attn_factor = 0.0f; |
| 1486 | + float rope_yarn_log_mul = 0.0f; |
| 1487 | + uint32_t n_ctx_orig_yarn = 0; |
| 1488 | + llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; |
| 1489 | + |
| 1490 | + llama_token dec_start_token_id = -1; // decoder start token id |
| 1491 | + |
| 1492 | + std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr; |
| 1493 | + std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; |
| 1494 | + std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; |
| 1495 | + std::array<uint32_t, LLAMA_MAX_LAYERS> rope_sections; |
| 1496 | + std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers; |
| 1497 | + std::vector<std::string> layers_block_type_arr; |
| 1498 | + |
| 1499 | + |
| 1500 | + // posnet / convnext hparams |
| 1501 | + struct { |
| 1502 | + uint32_t n_embd = 0; |
| 1503 | + uint32_t n_layer = 0; |
| 1504 | + } posnet; |
| 1505 | + |
| 1506 | + struct { |
| 1507 | + uint32_t n_embd = 0; |
| 1508 | + uint32_t n_layer = 0; |
| 1509 | + } convnext; |
| 1510 | + |
| 1511 | + // helper functions |
| 1512 | + uint32_t n_head (uint32_t il = 0) const { return n_head_arr [il % n_head_arr.size()]; } |
| 1513 | + uint32_t n_head_kv (uint32_t il = 0) const { return n_head_kv_arr[il % n_head_kv_arr.size()]; } |
| 1514 | + uint32_t n_ff (uint32_t il = 0) const { return n_ff_arr [il % n_ff_arr.size()]; } |
| 1515 | + uint32_t n_gqa (uint32_t il = 0) const { return n_head_kv(il) == 0 ? 0 : n_head(il)/n_head_kv(il); } // avoid division by zero when a layer has no KV heads
| 1516 | + uint32_t n_embd_k_gqa(uint32_t il = 0) const { return n_embd_head_k * n_head_kv(il); } // dimension of K (w/ GQA) |
| 1517 | + uint32_t n_embd_v_gqa(uint32_t il = 0) const { return n_embd_head_v * n_head_kv(il); } // dimension of V (w/ GQA) |
| 1518 | + uint32_t n_embd_k_s() const; // dimension of recurrent state for K |
| 1519 | + uint32_t n_embd_v_s() const; // dimension of recurrent state for V |
| 1520 | + |
| 1521 | + bool is_swa(uint32_t il) const { |
| 1522 | + return swa_layers[il % swa_layers.size()] != 0; |
| 1523 | + } |
| 1524 | + |
| 1525 | + bool is_swa_any() const { |
| 1526 | + for (uint32_t il = 0; il < n_layer; ++il) { |
| 1527 | + if (is_swa(il)) { |
| 1528 | + return true; |
| 1529 | + } |
| 1530 | + } |
| 1531 | + return false; |
| 1532 | + } |
| 1533 | + |
| 1534 | + void set_swa_pattern(uint32_t pattern) { |
| 1535 | + for (uint32_t il = 0; il < n_layer; ++il) { |
| 1536 | + swa_layers[il] = (pattern == 0 || (il + 1) % pattern == 0) ? 0 : 1; // pattern == 0 disables SWA and avoids modulo by zero
| 1537 | + } |
| 1538 | + } |
| 1539 | +}; |
| 1540 | +#endif // __cplusplus |
1376 | 1541 |
|
1377 | 1542 | #endif // LLAMA_H |
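
A minimal usage sketch (not part of the diff) of how the per-layer helpers replace the removed `n_embd_k_gqa`/`n_embd_v_gqa` fields. All hparam values below (layer count, head sizes, SWA pattern) are made-up illustrative numbers, not taken from any real model:

```cpp
#include <cstdio>
#include "llama.h" // with the llama_hparams struct added by this patch

int main() {
    llama_hparams hp;
    hp.n_layer       = 6;     // illustrative values only
    hp.n_embd_head_k = 128;
    hp.n_embd_head_v = 128;
    hp.n_head_arr.fill(32);   // 32 attention heads per layer
    hp.n_head_kv_arr.fill(8); // 8 KV heads per layer (GQA)

    // every 3rd layer uses full attention, the rest use sliding-window attention
    hp.set_swa_pattern(3);

    for (uint32_t il = 0; il < hp.n_layer; ++il) {
        printf("layer %u: n_gqa=%u n_embd_k_gqa=%u swa=%d\n",
               (unsigned) il, (unsigned) hp.n_gqa(il),
               (unsigned) hp.n_embd_k_gqa(il), hp.is_swa(il) ? 1 : 0);
    }
    return 0;
}
```

With GQA the per-layer K cache dimension is `n_embd_head_k * n_head_kv(il)` = 128 * 8 = 1024 in this example, independent of the 32 query heads, which is exactly why the old per-model scalar fields could be dropped in favor of these helpers.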