@@ -257,6 +257,7 @@ struct model_variant {
     model_variant(llm_arch arch, const std::string & name) : arch(arch), name(name) {
         add_kv(LLM_KV_GENERAL_TYPE, "model");
         add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
+        add_kv(LLM_KV_GENERAL_NAME, name);
     }
 
     model_variant(const model_variant & other) :
@@ -359,7 +360,118 @@ struct model_variant {
         // TODO: how to make the variants more modular?
         switch (arch) {
             case LLM_ARCH_LLAMA:
-            case LLM_ARCH_LLAMA4:
+                {
+                    variants.push_back(model_variant(arch, "Llama2"));
+                    model_variant & cur = variants.back();
+
+                    n_embd = 16;
+                    const uint32_t n_head = 4;
+                    const uint32_t n_head_kv = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa = n_embd_k_gqa;
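+                    // grouped-query attention: K and V project to n_head_kv heads,
+                    // so n_embd_k_gqa and n_embd_v_gqa are half the width of the Q projection here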
+
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // omitting the output tensor so that tok_embd is used in its place
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    }
+                } break;
+            case LLM_ARCH_LLAMA4: // has chunked interleaved sliding-window
+                {
+                    variants.push_back(model_variant(arch, "Llama4"));
+                    model_variant & cur = variants.back();
+
+                    n_layer = 4; // for the swa pattern
+                    n_embd = 16;
+                    const uint32_t n_head = 4;
+                    const uint32_t n_head_kv = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa = n_embd_k_gqa;
+                    const uint32_t n_moe_layer_step = 2;
+                    const uint32_t n_ff_exp = n_embd * 2;
+                    const uint32_t n_expert = 4;
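+                    // MoE config: 4 experts with 2 used per token, and a dense FFN on every other layer
+                    // (controlled by LLM_KV_INTERLEAVE_MOE_LAYER_STEP below)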
+
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_EXPERT_COUNT, n_expert);
+                    cur.add_kv(LLM_KV_EXPERT_USED_COUNT, (uint32_t) 2);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
+                    cur.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, n_moe_layer_step);
+                    cur.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff_exp);
+                    // FIXME: this isn't used because the default is 8192
+                    cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // omitting the output tensor so that tok_embd is used in its place
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        bool is_moe_layer = (i + 1) % n_moe_layer_step == 0;
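+                        // with n_moe_layer_step == 2 and n_layer == 4, layers 1 and 3 get the MoE FFN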
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        if (is_moe_layer) {
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert});
+
+                            // Shared expert
+                            const int64_t n_ff_shexp = n_ff_exp;
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd });
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp});
+                        } else {
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        }
+                    }
+                } break;
             case LLM_ARCH_DECI:
             case LLM_ARCH_FALCON:
             case LLM_ARCH_BAICHUAN:
@@ -392,7 +504,54 @@ struct model_variant {
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_MINICPM3:
             case LLM_ARCH_GEMMA:
-            case LLM_ARCH_GEMMA2:
+                break;
+            case LLM_ARCH_GEMMA2: // has standard interleaved sliding-window
+                {
+                    variants.push_back(model_variant(arch, "Gemma2"));
+                    model_variant & cur = variants.back();
+
+                    n_layer = 2; // minimum for the swa pattern
+                    n_embd = 16;
+                    const uint32_t n_head = 4;
+                    const uint32_t n_head_kv = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa = n_embd_k_gqa;
+
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 50.0f);
+                    cur.add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, 30.0f);
+                    cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_GEMMA3:
             case LLM_ARCH_STARCODER2:
                 break;
@@ -679,7 +838,7 @@ struct reference_logits {
             // fprintf(stderr, "Potential error in seq_id %i starting from pos %i\n", seq_id, first_pos_error);
         }
 
-        const float denom = std::sqrt(sumr2 * sumo2);
+        const float denom = std::sqrt(sumr2) * std::sqrt(sumo2);
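+        // (presumably splitting the square root avoids overflowing the product sumr2 * sumo2)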
 
         return sumerr2 / (denom > 0.0f ? denom : 1.0f);
     }
@@ -767,8 +926,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int32_t n_batch = 2048;
-    const int32_t n_seq_len = 1024;
+    const int32_t n_batch = 3 * 512;
+    const int32_t n_seq_len = 643; // prime number
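+    // (a prime length is presumably used so sequence boundaries don't line up with n_batch or n_ubatch)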
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -798,7 +957,7 @@ int main(int argc, char ** argv) {
     // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
     // const auto n_embd = llama_model_n_embd(model);
 
-    for (int32_t n_seq_max : { 1, 2, 13 }) {
+    for (int32_t n_seq_max : { 1, 2, 5 }) {
 
         // TODO(later): context shift testing
         for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
@@ -811,8 +970,6 @@ int main(int argc, char ** argv) {
             ref_params.n_ubatch = 1;
             ref_params.n_ctx = n_seq_len;
             ref_params.n_seq_max = 1;
-            ref_params.n_threads = 1;
-            ref_params.n_threads_batch = 1;
 
             llama_context * ref_ctx = llama_init_from_model(model, ref_params);
 
@@ -828,6 +985,13 @@ int main(int argc, char ** argv) {
 
             for (bool shuffle : { false, true }) {
 
+                // skip shuffling the batch for non-recurrent models
+                // (simple splits don't handle shuffled batches correctly)
+                // FIXME: remove this
+                if (shuffle && !llama_model_is_recurrent(model)) {
+                    continue;
+                }
+
                 for (int32_t n_ubatch : { 1, 2, 512 }) {
 
                     std::vector<bool> valid(n_seq_max, true);
@@ -881,8 +1045,6 @@ int main(int argc, char ** argv) {
 
                     GGML_ASSERT(n_seq_max <= n_batch); // not handling splitting this across batches here
 
-                    // TODO: use seq_rm, seq_cp, etc. to test if they work properly
-
                     // cont batching
                     for (llama_seq_id s : seq_ids_in_batch) {
                         llama_pos & pos = seq_id_n_past[s];
@@ -909,6 +1071,10 @@ int main(int argc, char ** argv) {
                         exit(1);
                     }
 
+                    // TODO: use seq_rm, seq_cp, etc. to test if they work properly
+
+                    // TODO: test pooled embeddings
+
                     llama_free(ctx);
                 }
             }