Commit dfa3c18

tests : add LLAMA, LLAMA4, and GEMMA2 to test-model-random
Parent: 61f6429

2 files changed: +177 -11 lines

tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -145,6 +145,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build_and_test(test-grammar-integration.cpp)
     llama_build_and_test(test-llama-grammar.cpp)
     llama_build_and_test(test-chat.cpp)
+    llama_build_and_test(test-model-random.cpp)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})

@@ -193,7 +194,6 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
-llama_build_and_test(test-model-random.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")

tests/test-model-random.cpp

Lines changed: 176 additions & 10 deletions
@@ -257,6 +257,7 @@ struct model_variant {
     model_variant(llm_arch arch, const std::string & name) : arch(arch), name(name) {
         add_kv(LLM_KV_GENERAL_TYPE, "model");
         add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
+        add_kv(LLM_KV_GENERAL_NAME, name);
     }
 
     model_variant(const model_variant & other) :
@@ -359,7 +360,118 @@ struct model_variant {
         // TODO: how to make the variants more modular?
         switch (arch) {
             case LLM_ARCH_LLAMA:
-            case LLM_ARCH_LLAMA4:
+                {
+                    variants.push_back(model_variant(arch, "Llama2"));
+                    model_variant & cur = variants.back();
+
+                    n_embd = 16;
+                    const uint32_t n_head = 4;
+                    const uint32_t n_head_kv = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa = n_embd_k_gqa;
+
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // omitting the actual output tensor to leave it use tok_embd
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    }
+                } break;
+            case LLM_ARCH_LLAMA4: // has chunked interleaved sliding-window
+                {
+                    variants.push_back(model_variant(arch, "Llama4"));
+                    model_variant & cur = variants.back();
+
+                    n_layer = 4; // for the swa pattern
+                    n_embd = 16;
+                    const uint32_t n_head = 4;
+                    const uint32_t n_head_kv = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa = n_embd_k_gqa;
+                    const uint32_t n_moe_layer_step = 2;
+                    const uint32_t n_ff_exp = n_embd * 2;
+                    const uint32_t n_expert = 4;
+
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_EXPERT_COUNT, n_expert);
+                    cur.add_kv(LLM_KV_EXPERT_USED_COUNT, (uint32_t) 2);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
+                    cur.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, n_moe_layer_step);
+                    cur.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff_exp);
+                    // FIXME: this isn't used because the default is 8192
+                    cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // omitting the actual output tensor to leave it use tok_embd
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        bool is_moe_layer = (i + 1) % n_moe_layer_step == 0;
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        if (is_moe_layer) {
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert});
+
+                            // Shared expert
+                            const int64_t n_ff_shexp = n_ff_exp;
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd });
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp});
+                        } else {
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        }
+                    }
+                } break;
             case LLM_ARCH_DECI:
             case LLM_ARCH_FALCON:
             case LLM_ARCH_BAICHUAN:
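
Note: all of the new variants use the same tiny grouped-query attention (GQA) geometry, and the tensor shapes above follow directly from it. The standalone sketch below (illustrative only, not part of the test) just works through that arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // constants taken from the variant definitions above
        const uint32_t n_embd        = 16;
        const uint32_t n_head        = 4;
        const uint32_t n_head_kv     = n_head / 2;                // 2 KV heads -> GQA
        const uint32_t n_embd_head_k = n_embd / n_head;           // 16 / 4 = 4
        const uint32_t n_embd_k_gqa  = n_embd_head_k * n_head_kv; // 4 * 2 = 8
        const uint32_t n_embd_v_gqa  = n_embd_k_gqa;              // 8

        // Q and output projections keep the full width, K/V are half as wide
        std::printf("Q:   %u x %u\n", n_embd, n_embd_head_k * n_head); // 16 x 16
        std::printf("K:   %u x %u\n", n_embd, n_embd_k_gqa);           // 16 x 8
        std::printf("V:   %u x %u\n", n_embd, n_embd_v_gqa);           // 16 x 8
        std::printf("out: %u x %u\n", n_embd_head_k * n_head, n_embd); // 16 x 16
        return 0;
    }

With 4 query heads but only 2 KV heads, the K/V projections come out half as wide as Q, which is exactly what the {n_embd, n_embd_k_gqa} shapes above encode.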
@@ -392,7 +504,54 @@ struct model_variant {
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_MINICPM3:
             case LLM_ARCH_GEMMA:
-            case LLM_ARCH_GEMMA2:
+                break;
+            case LLM_ARCH_GEMMA2: // has standard interleaved sliding-window
+                {
+                    variants.push_back(model_variant(arch, "Gemma2"));
+                    model_variant & cur = variants.back();
+
+                    n_layer = 2; // minimum for the swa pattern
+                    n_embd = 16;
+                    const uint32_t n_head = 4;
+                    const uint32_t n_head_kv = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa = n_embd_k_gqa;
+
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 50.0f);
+                    cur.add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, 30.0f);
+                    cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_GEMMA3:
             case LLM_ARCH_STARCODER2:
                 break;
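
Note: Gemma2 interleaves sliding-window and full-attention layers, which is why n_layer = 2 is called the minimum for the swa pattern, and the window of 389 is a prime so it does not line up with the batch and ubatch sizes used later in the test. As a rough illustration of what a sliding-window causal mask means (assumed semantics, not llama.cpp's actual masking code):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: may a query at position pos_q attend to a key at
    // position pos_k under a causal mask with sliding window n_swa?
    static bool swa_allows(int32_t pos_q, int32_t pos_k, int32_t n_swa) {
        if (pos_k > pos_q) {
            return false;               // causal: the future is always masked
        }
        return (pos_q - pos_k) < n_swa; // the key must fall inside the window
    }

    int main() {
        const int32_t n_swa = 389;
        std::printf("pos_q=400, pos_k=300 -> %d\n", swa_allows(400, 300, n_swa)); // 1: within window
        std::printf("pos_q=400, pos_k=  5 -> %d\n", swa_allows(400,   5, n_swa)); // 0: too far back
        return 0;
    }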
@@ -679,7 +838,7 @@ struct reference_logits {
            // fprintf(stderr, "Potential error in seq_id %i starting from pos %i\n", seq_id, first_pos_error);
        }
 
-        const float denom = std::sqrt(sumr2 * sumo2);
+        const float denom = std::sqrt(sumr2) * std::sqrt(sumo2);
 
        return sumerr2 / (denom > 0.0f ? denom : 1.0f);
    }
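
Note: the two denominator forms are mathematically equal for non-negative sums of squares, but the product sumr2 * sumo2 can overflow single precision and turn the denominator into infinity, while taking the square roots first stays finite; that is presumably the motivation for the change. A minimal standalone illustration (not from the test itself):

    #include <cmath>
    #include <cstdio>

    int main() {
        // two large sums of squares, each well within float range on its own
        const float sumr2 = 1e30f;
        const float sumo2 = 1e30f;

        const float bad  = std::sqrt(sumr2 * sumo2);            // 1e60 overflows float -> inf
        const float good = std::sqrt(sumr2) * std::sqrt(sumo2); // 1e15 * 1e15 = 1e30, finite

        std::printf("sqrt(a*b)       = %g\n", bad);
        std::printf("sqrt(a)*sqrt(b) = %g\n", good);
        return 0;
    }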
@@ -767,8 +926,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int32_t n_batch = 2048;
-    const int32_t n_seq_len = 1024;
+    const int32_t n_batch = 3 * 512;
+    const int32_t n_seq_len = 643; // prime number
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
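
Note: the new sizes make the splits intentionally uneven: n_batch = 3 * 512 = 1536, and n_seq_len = 643 is prime, so it never divides evenly by the larger ubatch sizes (2 and 512) exercised below. A small standalone sketch of the resulting split arithmetic (illustrative only):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_batch   = 3 * 512; // 1536
        const int32_t n_seq_len = 643;     // prime

        for (int32_t n_ubatch : { 1, 2, 512 }) {
            const int32_t full = n_seq_len / n_ubatch; // number of full ubatches
            const int32_t rem  = n_seq_len % n_ubatch; // leftover tokens in the last split
            std::printf("n_ubatch=%3d -> %3d full ubatches + remainder %d\n", n_ubatch, full, rem);
        }

        // n_ctx is chosen as n_seq_len * n_seq_max, e.g. 643 * 5 = 3215 for 5 sequences
        std::printf("n_batch=%d, 643 %% 512 = %d\n", n_batch, 643 % 512);
        return 0;
    }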
@@ -798,7 +957,7 @@ int main(int argc, char ** argv) {
        // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
        // const auto n_embd = llama_model_n_embd(model);
 
-        for (int32_t n_seq_max : { 1, 2, 13 } ) {
+        for (int32_t n_seq_max : { 1, 2, 5 } ) {
 
            // TODO(later): context shift testing
            for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
@@ -811,8 +970,6 @@ int main(int argc, char ** argv) {
                ref_params.n_ubatch = 1;
                ref_params.n_ctx = n_seq_len;
                ref_params.n_seq_max = 1;
-                ref_params.n_threads = 1;
-                ref_params.n_threads_batch = 1;
 
                llama_context * ref_ctx = llama_init_from_model(model, ref_params);
 
@@ -828,6 +985,13 @@ int main(int argc, char ** argv) {
 
                for (bool shuffle : { false, true }) {
 
+                    // skip shuffling the batch for non-recurrent models
+                    // (simple splits don't handle shuffled batches correctly)
+                    // FIXME: remove this
+                    if (shuffle && !llama_model_is_recurrent(model)) {
+                        continue;
+                    }
+
                    for (int32_t n_ubatch : { 1, 2, 512 } ) {
 
                        std::vector<bool> valid(n_seq_max, true);
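
Note: the skipped shuffle case presumably reorders the tokens within a batch while keeping their (seq_id, pos) metadata attached, which the simple splits used for non-recurrent models reportedly do not handle yet. A hypothetical sketch of that kind of reordering (not the test's actual code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <random>
    #include <vector>

    // Hypothetical batch entry: position and sequence id travel with the token,
    // so reordering the entries does not change what the batch means.
    struct batch_entry {
        int32_t token;
        int32_t pos;
        int32_t seq_id;
    };

    int main() {
        std::mt19937 rng(42);

        std::vector<batch_entry> entries;
        for (int32_t pos = 0; pos < 8; ++pos) {
            entries.push_back({ 100 + pos, pos, 0 }); // one toy sequence
        }

        // shuffle only the order of the entries; (pos, seq_id) stay attached
        std::shuffle(entries.begin(), entries.end(), rng);

        for (const batch_entry & e : entries) {
            std::printf("token=%d pos=%d seq_id=%d\n", e.token, e.pos, e.seq_id);
        }
        return 0;
    }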
@@ -881,8 +1045,6 @@ int main(int argc, char ** argv) {
 
                        GGML_ASSERT(n_seq_max <= n_batch); // not handling splitting this across batches here
 
-                        // TODO: use seq_rm, seq_cp, etc. to test if they work properly
-
                        // cont batching
                        for (llama_seq_id s : seq_ids_in_batch) {
                            llama_pos & pos = seq_id_n_past[s];
@@ -909,6 +1071,10 @@ int main(int argc, char ** argv) {
                            exit(1);
                        }
 
+                        // TODO: use seq_rm, seq_cp, etc. to test if they work properly
+
+                        // TODO: test pooled embeddings
+
                        llama_free(ctx);
                    }
                }
