@@ -340,7 +340,7 @@ struct model_variant {
         return status;
     }
 
-    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch) {
+    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch, uint32_t n_ctx) {
         uint32_t n_vocab = 256;
         uint32_t n_embd  = 32;
         uint32_t n_ff    = 3 * n_embd;
@@ -391,7 +391,7 @@ struct model_variant {
                 const uint32_t n_embd_v_gqa = n_embd_k_gqa;
 
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
                 cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
@@ -439,7 +439,7 @@ struct model_variant {
                 const uint32_t n_expert = 4;
 
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
                 cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
@@ -450,8 +450,7 @@ struct model_variant {
                 cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
                 cur.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, n_moe_layer_step);
                 cur.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff_exp);
-                // FIXME: this isn't used because the default is 8192
-                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx / 2); // TODO: use a prime number
 
                 add_tokenizer(cur, n_vocab);
 
@@ -538,7 +537,7 @@ struct model_variant {
                 const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
                 const uint32_t n_embd_v_gqa = n_embd_k_gqa;
 
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
@@ -547,7 +546,7 @@ struct model_variant {
                 cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
                 cur.add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 50.0f);
                 cur.add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, 30.0f);
-                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx / 2); // TODO: use a prime number
 
                 add_tokenizer(cur, n_vocab);
 
@@ -1063,8 +1062,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int32_t n_batch   = 3 * 512;
-    const int32_t n_seq_len = 643; // prime number
+    const int32_t n_batch   = 509; // prime number
+    const int32_t n_seq_len = 127; // prime number
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -1073,7 +1072,7 @@ int main(int argc, char ** argv) {
 
     for (int i = 0; i < LLM_ARCH_UNKNOWN; ++i) {
         llm_arch arch = (llm_arch) i;
-        model_variant::insert_from_arch(model_variants, arch);
+        model_variant::insert_from_arch(model_variants, arch, n_seq_len);
     }
 
     // TODO: concurrent tests?
@@ -1094,7 +1093,7 @@ int main(int argc, char ** argv) {
         // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
         // const auto n_embd  = llama_model_n_embd(model);
 
-        for (int32_t n_seq_max : { 1, 2, 5 }) {
+        for (int32_t n_seq_max : { 1, 2, 5, 13 }) {
 
             // TODO(later): context shift testing
             for (int32_t n_ctx : { n_seq_len * n_seq_max }) {