
Commit 362cf54

test-model-random : configurable model n_ctx, and smaller seq lengths
1 parent 6b38c7a

File tree

2 files changed: +13 -12 lines changed

src/llama-model.cpp

Lines changed: 3 additions & 1 deletion
@@ -598,9 +598,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
 
                 hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+                hparams.n_swa = 8192; // currently it's the same for Scout and Maverick
                 hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
 
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); // for overrides in tests
+
                 switch (hparams.n_expert) {
                     case 16:  type = LLM_TYPE_17B_16E; break;
                     case 128: type = LLM_TYPE_17B_128E; break;
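
Note on the loader change: the trailing false in ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false) marks the key as optional, so hparams.n_swa keeps the hard-coded 8192 unless the GGUF metadata actually carries a sliding-window value, which the random test models below now do. A minimal standalone sketch of that default-plus-optional-override pattern (the get_key_opt helper and the string key are illustrative stand-ins, not the llama.cpp loader API):

// sketch: optional GGUF KV override of a hard-coded default (assumed helper, not llama.cpp API)
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

static void get_key_opt(const std::map<std::string, uint32_t> & kv,
                        const std::string & key, uint32_t & out) {
    auto it = kv.find(key);
    if (it != kv.end()) {
        out = it->second; // key present: override the default
    }                     // key absent: leave the default untouched
}

int main() {
    uint32_t n_swa = 8192; // hard-coded default, as in load_hparams above

    // a test GGUF can now carry a much smaller window, e.g. n_ctx / 2 = 63 for n_ctx = 127
    std::map<std::string, uint32_t> kv = { { "llama4.attention.sliding_window", 63 } };

    get_key_opt(kv, "llama4.attention.sliding_window", n_swa);
    std::printf("n_swa = %u\n", n_swa); // prints 63; would print 8192 without the KV
}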

tests/test-model-random.cpp

Lines changed: 10 additions & 11 deletions
@@ -340,7 +340,7 @@ struct model_variant {
         return status;
     }
 
-    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch) {
+    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch, uint32_t n_ctx) {
         uint32_t n_vocab = 256;
         uint32_t n_embd = 32;
         uint32_t n_ff = 3 * n_embd;
@@ -391,7 +391,7 @@ struct model_variant {
                 const uint32_t n_embd_v_gqa = n_embd_k_gqa;
 
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
                 cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
@@ -439,7 +439,7 @@ struct model_variant {
                 const uint32_t n_expert = 4;
 
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
                 cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
@@ -450,8 +450,7 @@ struct model_variant {
                 cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
                 cur.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, n_moe_layer_step);
                 cur.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff_exp);
-                // FIXME: this isn't used because the default is 8192
-                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx / 2); // TODO: use a prime number
 
                 add_tokenizer(cur, n_vocab);
 
@@ -538,7 +537,7 @@ struct model_variant {
                 const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
                 const uint32_t n_embd_v_gqa = n_embd_k_gqa;
 
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
@@ -547,7 +546,7 @@ struct model_variant {
                 cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
                 cur.add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 50.0f);
                 cur.add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, 30.0f);
-                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx / 2); // TODO: use a prime number
 
                 add_tokenizer(cur, n_vocab);
 
@@ -1063,8 +1062,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int32_t n_batch = 3 * 512;
-    const int32_t n_seq_len = 643; // prime number
+    const int32_t n_batch = 509; // prime number
+    const int32_t n_seq_len = 127; // prime number
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -1073,7 +1072,7 @@ int main(int argc, char ** argv) {
 
     for (int i = 0; i < LLM_ARCH_UNKNOWN; ++i) {
         llm_arch arch = (llm_arch) i;
-        model_variant::insert_from_arch(model_variants, arch);
+        model_variant::insert_from_arch(model_variants, arch, n_seq_len);
     }
 
     // TODO: concurrent tests?
@@ -1094,7 +1093,7 @@ int main(int argc, char ** argv) {
     // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
     // const auto n_embd = llama_model_n_embd(model);
 
-    for (int32_t n_seq_max : { 1, 2, 5 } ) {
+    for (int32_t n_seq_max : { 1, 2, 5, 13 } ) {
 
         // TODO(later): context shift testing
         for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
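
For reference, a small standalone sketch (not part of the test) of the sizes the updated test now exercises; the prime choices presumably avoid lining up with internal block or padding sizes, which is an assumption rather than something stated in the commit:

// sketch: the batch/sequence/context sizes driven by the new constants
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t n_batch   = 509; // prime number (per the diff)
    const int32_t n_seq_len = 127; // prime number; also passed as the model's n_ctx

    const uint32_t n_swa = (uint32_t) n_seq_len / 2; // 63, the sliding window written into the test GGUFs

    std::printf("n_batch = %d, n_seq_len = %d, n_swa = %u\n", n_batch, n_seq_len, n_swa);

    for (int32_t n_seq_max : { 1, 2, 5, 13 }) {
        const int32_t n_ctx = n_seq_len * n_seq_max; // context size tested for this n_seq_max
        std::printf("n_seq_max = %2d -> n_ctx = %d\n", n_seq_max, n_ctx); // 127, 254, 635, 1651
    }
}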
