Skip to content

Commit 77b4a8e

Browse files
committed
update_kernel
1 parent 0fec91c commit 77b4a8e

File tree

27 files changed

+287
-93
lines changed

27 files changed

+287
-93
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"4": {
3+
"BK": 128,
4+
"BV": 64,
5+
"num_stages": 4,
6+
"num_warps": 4
7+
}
8+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"4": {
3+
"BK": 128,
4+
"BV": 64,
5+
"num_stages": 2,
6+
"num_warps": 4
7+
}
8+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"4": {
3+
"BK": 64,
4+
"BV": 128,
5+
"num_stages": 3,
6+
"num_warps": 4
7+
}
8+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"4": {
3+
"BV": 32,
4+
"num_stages": 4,
5+
"num_warps": 4
6+
}
7+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"1": {
3+
"num_warps": 4
4+
},
5+
"100": {
6+
"num_warps": 8
7+
},
8+
"1024": {
9+
"num_warps": 4
10+
},
11+
"128": {
12+
"num_warps": 1
13+
},
14+
"16": {
15+
"num_warps": 4
16+
},
17+
"2048": {
18+
"num_warps": 2
19+
},
20+
"256": {
21+
"num_warps": 1
22+
},
23+
"32": {
24+
"num_warps": 8
25+
},
26+
"4096": {
27+
"num_warps": 2
28+
},
29+
"64": {
30+
"num_warps": 8
31+
},
32+
"8": {
33+
"num_warps": 8
34+
},
35+
"8448": {
36+
"num_warps": 1
37+
}
38+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"4": {
3+
"BK": 64,
4+
"num_stages": 3,
5+
"num_warps": 4
6+
}
7+
}
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,3 @@
11
{
2-
"1": {
3-
"BLK_HEADS": 64,
4-
"num_warps": 2
5-
},
6-
"100": {
7-
"BLK_HEADS": 16,
8-
"num_warps": 2
9-
},
10-
"1024": {
11-
"BLK_HEADS": 8,
12-
"num_warps": 2
13-
},
14-
"128": {
15-
"BLK_HEADS": 64,
16-
"num_warps": 2
17-
},
18-
"16": {
19-
"BLK_HEADS": 16,
20-
"num_warps": 1
21-
},
22-
"256": {
23-
"BLK_HEADS": 16,
24-
"num_warps": 2
25-
},
26-
"32": {
27-
"BLK_HEADS": 16,
28-
"num_warps": 1
29-
},
30-
"64": {
31-
"BLK_HEADS": 8,
32-
"num_warps": 2
33-
},
34-
"8": {
35-
"BLK_HEADS": 64,
36-
"num_warps": 4
37-
}
2+
"8448": null
383
}

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gated_rmsnorm_forward:v1/{N=128,has_bias=false,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
"BLOCK_N": 256,
88
"num_warps": 1
99
},
10+
"16384": {
11+
"BLOCK_N": 128,
12+
"num_warps": 1
13+
},
1014
"2048": {
1115
"BLOCK_N": 64,
1216
"num_warps": 1
@@ -15,6 +19,10 @@
1519
"BLOCK_N": 256,
1620
"num_warps": 1
1721
},
22+
"32768": {
23+
"BLOCK_N": 256,
24+
"num_warps": 2
25+
},
1826
"512": {
1927
"BLOCK_N": 512,
2028
"num_warps": 4
@@ -23,6 +31,10 @@
2331
"BLOCK_N": 256,
2432
"num_warps": 1
2533
},
34+
"67584": {
35+
"BLOCK_N": 64,
36+
"num_warps": 1
37+
},
2638
"8": {
2739
"BLOCK_N": 512,
2840
"num_warps": 8

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=2048,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"2048": {
3-
"BLOCK_SIZE": 2048,
3+
"BLOCK_SIZE": 4096,
44
"num_stages": 4,
55
"num_warps": 4
66
}

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=128,N=2048,expert_num=512,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@
4444
"num_stages": 2,
4545
"num_warps": 4
4646
},
47+
"20480": {
48+
"BLOCK_SIZE_K": 32,
49+
"BLOCK_SIZE_M": 64,
50+
"BLOCK_SIZE_N": 128,
51+
"GROUP_SIZE_M": 32,
52+
"NEED_TRANS": false,
53+
"num_stages": 3,
54+
"num_warps": 4
55+
},
4756
"2560": {
4857
"BLOCK_SIZE_K": 64,
4958
"BLOCK_SIZE_M": 16,
@@ -62,6 +71,15 @@
6271
"num_stages": 3,
6372
"num_warps": 4
6473
},
74+
"40960": {
75+
"BLOCK_SIZE_K": 32,
76+
"BLOCK_SIZE_M": 64,
77+
"BLOCK_SIZE_N": 128,
78+
"GROUP_SIZE_M": 32,
79+
"NEED_TRANS": false,
80+
"num_stages": 3,
81+
"num_warps": 4
82+
},
6583
"640": {
6684
"BLOCK_SIZE_K": 128,
6785
"BLOCK_SIZE_M": 16,
@@ -79,5 +97,14 @@
7997
"NEED_TRANS": false,
8098
"num_stages": 2,
8199
"num_warps": 4
100+
},
101+
"84480": {
102+
"BLOCK_SIZE_K": 32,
103+
"BLOCK_SIZE_M": 64,
104+
"BLOCK_SIZE_N": 128,
105+
"GROUP_SIZE_M": 32,
106+
"NEED_TRANS": false,
107+
"num_stages": 3,
108+
"num_warps": 4
82109
}
83110
}

0 commit comments

Comments
 (0)