@@ -1536,7 +1536,9 @@ def norm_class(self) -> Type:
         name="CodeLlama-70b-Instruct-hf",
         hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"),
         block_size=16384,
-        vocab_size=32016,
+        # 32016 is an added token, so not reported in vocab_size
+        # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/tokenizer_config.json
+        vocab_size=32015,
         padding_multiple=16,
         n_layer=80,
         n_head=64,
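Context for the CodeLlama change: with `padding_multiple=16` and no explicit `padded_vocab_size`, litgpt-style configs round the vocabulary up to the next multiple of 16, so the corrected `vocab_size=32015` still yields a 32016-row embedding table with room for the added token. A minimal sketch of that rounding, assuming the usual `find_multiple` helper (the helper name and padding rule are assumptions here, not quoted from this commit):

```python
# Minimal sketch (assumed behavior): how a litgpt-style Config pads the
# vocabulary when padded_vocab_size is not given explicitly.
def find_multiple(n: int, k: int) -> int:
    """Round n up to the nearest multiple of k."""
    return n if n % k == 0 else n + k - (n % k)

# vocab_size=32015 with padding_multiple=16 still pads to 32016 rows, so the
# embedding matrix keeps a slot for the added token; the old value 32016 was
# already a multiple of 16 and padded to the same size.
assert find_multiple(32015, 16) == 32016
assert find_multiple(32016, 16) == 32016
```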
@@ -2331,53 +2333,6 @@ def norm_class(self) -> Type:
     ),
 ]
 
-qwen_2_5_1m = [
-    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
-    dict(
-        name="Qwen2.5-7B-Instruct-1M",
-        hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
-        block_size=1010000,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=28,
-        n_head=28,
-        n_embd=3584,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=18944,
-        norm_eps=1e-5,
-        rope_base=10000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
-    dict(
-        name="Qwen2.5-14B-Instruct-1M",
-        hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
-        block_size=1010000,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=48,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        norm_eps=1e-5,
-        rope_base=10000000,
-    ),
-]
-
-qwen_2_5.extend(qwen_2_5_1m)
-
 qwen_2_5_coder = [
     # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json
     dict(
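An aside on the Qwen2.5-1M hyperparameters moved by this commit: `n_head=28` with `n_query_groups=4` (7B) and `n_head=40` with `n_query_groups=8` (14B) describe grouped-query attention, where several query heads share one KV head. A quick shape check for the 7B numbers; the fused qkv layout below is an assumption, not quoted from the repository:

```python
# GQA shape sketch for Qwen2.5-7B-Instruct-1M: 28 query heads share 4 KV
# heads, i.e. 7 query heads per KV group.
n_head, n_query_groups, n_embd = 28, 4, 3584
head_size = n_embd // n_head             # 128
q_per_kv = n_head // n_query_groups      # 7
# Fused qkv projection width in a litgpt-style layout (assumption):
qkv_size = (n_head + 2 * n_query_groups) * head_size
assert (head_size, q_per_kv, qkv_size) == (128, 7, 4608)
```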
@@ -2584,6 +2539,53 @@ def norm_class(self) -> Type:
         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
         configs.append(copy)
 
+qwen_2_5_1m = [
+    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
+    dict(
+        name="Qwen2.5-7B-Instruct-1M",
+        hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
+        block_size=1010000,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=28,
+        n_head=28,
+        n_embd=3584,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=18944,
+        norm_eps=1e-5,
+        rope_base=10000000,
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
+    dict(
+        name="Qwen2.5-14B-Instruct-1M",
+        hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
+        block_size=1010000,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=48,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=13824,
+        norm_eps=1e-5,
+        rope_base=10000000,
+    ),
+]
+
+configs.extend(qwen_2_5_1m)
+
 ##########
 # QwQ
 ##########
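Why the `qwen_2_5_1m` block moves below the formatting loop: that loop appends one copy of every `qwen_2_5` entry per `kind` suffix, and the `-1M` names contain no `{}` placeholder, so extending them into `qwen_2_5` beforehand registered each of them once per suffix. Appending via `configs.extend(qwen_2_5_1m)` registers each exactly once. A minimal sketch of the duplication; the `("", "-Instruct")` suffix tuple and the trimmed dicts are illustrative assumptions:

```python
from copy import deepcopy

qwen_2_5 = [dict(name="Qwen2.5-7B{}")]               # placeholder-style entry
qwen_2_5_1m = [dict(name="Qwen2.5-7B-Instruct-1M")]  # fixed name, no "{}"

# Pre-fix arrangement: the 1M entries go through the suffix loop too.
configs = []
for c in qwen_2_5 + qwen_2_5_1m:
    for kind in ("", "-Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        configs.append(copy)
assert [c["name"] for c in configs].count("Qwen2.5-7B-Instruct-1M") == 2

# Post-fix arrangement: only placeholder entries are formatted, then the
# 1M entries are appended once.
configs = []
for c in qwen_2_5:
    for kind in ("", "-Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        configs.append(copy)
configs.extend(qwen_2_5_1m)
assert [c["name"] for c in configs].count("Qwen2.5-7B-Instruct-1M") == 1
```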