10
10
- [X] layer_norm_f16x2_f16_kernel
11
11
- [X] layer_norm_f16x8_f16_kernel
12
12
- [X] layer_norm_f16x8_pack_f16_kernel
13
+ - [X] layer_norm_f16x8_pack_f32_kernel
13
14
- [X] layer_norm_f16_f32_kernel
14
15
- [X] PyTorch bindings
15
16
@@ -27,64 +28,70 @@ python3 layer_norm.py
27
28
-------------------------------------------------------------------------------------
28
29
N=4096, K=512
29
30
-------------------------------------------------------------------------------------
30
- out_f32: [' -1.76292217 ' , ' 0.04765211 ' , ' 0.50859255 ' ], time:0.01897240ms
31
- out_f32x4: [' -1.76292217 ' , ' 0.04765211 ' , ' 0.50859255 ' ], time:0.00600266ms
32
- out_f32_th: [' -1.76119995 ' , ' 0.04760556 ' , ' 0.50809568 ' ], time:0.07085347ms
31
+ out_f32: [' -0.95119929 ' , ' 0.65728813 ' , ' -0.27701864 ' ], time:0.01898599ms
32
+ out_f32x4: [' -0.95119929 ' , ' 0.65728813 ' , ' -0.27701864 ' ], time:0.00600958ms
33
+ out_f32_th: [' -0.95026982 ' , ' 0.65664589 ' , ' -0.27674797 ' ], time:0.07345414ms
33
34
-------------------------------------------------------------------------------------
34
- out_f16f16: [' -1.76367188 ' , ' 0.04763794 ' , ' 0.50878906 ' ], time:0.01869035ms
35
- out_f16f32: [' -1.76367188 ' , ' 0.04766846 ' , ' 0.50878906 ' ], time:0.01897883ms
36
- out_f16x2f16: [' -1.76367188 ' , ' 0.04766846 ' , ' 0.50878906 ' ], time:0.00951219ms
37
- out_f16x8f16: [' -1.76367188 ' , ' 0.04766846 ' , ' 0.50878906 ' ], time:0.00467825ms
38
- out_f16x8packf16: [' -1.76367188 ' , ' 0.04763794 ' , ' 0.50878906 ' ], time:0.00430202ms
39
- out_f16_th: [' -1.76171875 ' , ' 0.04760742 ' , ' 0.50830078 ' ], time:0.07009959ms
35
+ out_f16f16: [' -0.95068359 ' , ' 0.65722656 ' , ' -0.27709961 ' ], time:0.01866651ms
36
+ out_f16f32: [' -0.95117188 ' , ' 0.65722656 ' , ' -0.27709961 ' ], time:0.01897073ms
37
+ out_f16x2f16: [' -0.95068359 ' , ' 0.65722656 ' , ' -0.27709961 ' ], time:0.00952697ms
38
+ out_f16x8f16: [' -0.95068359 ' , ' 0.65722656 ' , ' -0.27709961 ' ], time:0.00470805ms
39
+ out_f16x8packf16: [' -0.95117188 ' , ' 0.65673828 ' , ' -0.27709961 ' ], time:0.00427437ms
40
+ out_f16x8packf32: [' -0.95117188 ' , ' 0.65722656 ' , ' -0.27709961 ' ], time:0.00418639ms
41
+ out_f16_th: [' -0.94970703 ' , ' 0.65673828 ' , ' -0.27685547 ' ], time:0.07291913ms
40
42
-------------------------------------------------------------------------------------
41
43
-------------------------------------------------------------------------------------
42
44
N=4096, K=1024
43
45
-------------------------------------------------------------------------------------
44
- out_f32: [' -0.65619785 ' , ' 1.33576787 ' , ' -0.29172164 ' ], time:0.05123448ms
45
- out_f32x4: [' -0.65619785 ' , ' 1.33576787 ' , ' -0.29172164 ' ], time:0.01073551ms
46
- out_f32_th: [' -0.65587735 ' , ' 1.33511555 ' , ' -0.29157916 ' ], time:0.07034254ms
46
+ out_f32: [' 0.81839228 ' , ' 0.36616057 ' , ' -1.71588480 ' ], time:0.05122757ms
47
+ out_f32x4: [' 0.81839228 ' , ' 0.36616057 ' , ' -1.71588480 ' ], time:0.01071095ms
48
+ out_f32_th: [' 0.81799269 ' , ' 0.36598179 ' , ' -1.71504688 ' ], time:0.07267237ms
47
49
-------------------------------------------------------------------------------------
48
- out_f16f16: [' -0.65576172 ' , ' 1.3359375 ' , ' -0.29174805 ' ], time:0.05320668ms
49
- out_f16f32: [' -0.65576172 ' , ' 1.3359375 ' , ' -0.29150391 ' ], time:0.05061388ms
50
- out_f16x2f16: [' -0.65576172 ' , ' 1.3359375 ' , ' -0.29174805 ' ], time:0.01861978ms
51
- out_f16x8f16: [' -0.65576172 ' , ' 1.3359375 ' , ' -0.29174805 ' ], time:0.00745845ms
52
- out_f16x8packf16: [' -0.65576172 ' , ' 1.3359375 ' , ' -0.29174805 ' ], time:0.00648832ms
53
- out_f16_th: [' -0.65527344 ' , ' 1.33398438 ' , ' -0.29150391 ' ], time:0.07068610ms
50
+ out_f16f16: [' 0.81835938 ' , ' 0.36596680 ' , ' -1.71484375 ' ], time:0.05317926ms
51
+ out_f16f32: [' 0.81835938 ' , ' 0.36621094 ' , ' -1.71582031 ' ], time:0.05062103ms
52
+ out_f16x2f16: [' 0.81884766 ' , ' 0.36621094 ' , ' -1.71679688 ' ], time:0.01855445ms
53
+ out_f16x8f16: [' 0.81884766 ' , ' 0.36621094 ' , ' -1.71679688 ' ], time:0.00742888ms
54
+ out_f16x8packf16: [' 0.81884766 ' , ' 0.36621094 ' , ' -1.71679688 ' ], time:0.00645399ms
55
+ out_f16x8packf32: [' 0.81835938 ' , ' 0.36621094 ' , ' -1.71582031 ' ], time:0.00634456ms
56
+ out_f16_th: [' 0.81835938 ' , ' 0.36596680 ' , ' -1.71582031 ' ], time:0.07386255ms
54
57
-------------------------------------------------------------------------------------
55
58
-------------------------------------------------------------------------------------
56
59
N=4096, K=2048
57
60
-------------------------------------------------------------------------------------
58
- out_f32x4: [' 0.92044634 ' , ' 0.37421227 ' , ' -2.49094558 ' ], time:0.02202415ms
59
- out_f32_th: [' 0.92022169 ' , ' 0.37412092 ' , ' -2.49033761 ' ], time:0.12026787ms
61
+ out_f32x4: [' -0.65341073 ' , ' 0.10270299 ' , ' -0.06597849 ' ], time:0.02200651ms
62
+ out_f32_th: [' -0.65325129 ' , ' 0.10267793 ' , ' -0.06596238 ' ], time:0.12027287ms
60
63
-------------------------------------------------------------------------------------
61
- out_f16x2f16: [' 0.92041016 ' , ' 0.37426758 ' , ' -2.49023438 ' ], time:0.05346847ms
62
- out_f16x8f16: [' 0.92041016 ' , ' 0.37426758 ' , ' -2.49023438 ' ], time:0.01381087ms
63
- out_f16x8packf16: [' 0.92041016 ' , ' 0.37426758 ' , ' -2.49023438 ' ], time:0.01159072ms
64
- out_f16_th: [' 0.92041016 ' , ' 0.37426758 ' , ' -2.49023438 ' ], time:0.08454061ms
64
+ out_f16x2f16: [' -0.65332031 ' , ' 0.10266113 ' , ' -0.06591797 ' ], time:0.05352354ms
65
+ out_f16x8f16: [' -0.65380859 ' , ' 0.10272217 ' , ' -0.06597900 ' ], time:0.01377678ms
66
+ out_f16x8packf16: [' -0.65332031 ' , ' 0.10266113 ' , ' -0.06591797 ' ], time:0.01154637ms
67
+ out_f16x8packf32: [' -0.65332031 ' , ' 0.10272217 ' , ' -0.06597900 ' ], time:0.01166582ms
68
+ out_f16_th: [' -0.65380859 ' , ' 0.10272217 ' , ' -0.06597900 ' ], time:0.08442783ms
65
69
-------------------------------------------------------------------------------------
66
70
-------------------------------------------------------------------------------------
67
71
N=4096, K=4096
68
72
-------------------------------------------------------------------------------------
69
- out_f32x4: [' -2.05339074 ' , ' 0.25924587 ' , ' 0.42393678 ' ], time:0.18885875ms
70
- out_f32_th: [' -2.05314016 ' , ' 0.25921422 ' , ' 0.42388505 ' ], time:0.77834105ms
73
+ out_f32x4: [' 2.38733387 ' , ' -0.03023042 ' , ' 0.66022825 ' ], time:0.18884635ms
74
+ out_f32_th: [' 2.38704205 ' , ' -0.03022672 ' , ' 0.66014749 ' ], time:0.77852798ms
71
75
-------------------------------------------------------------------------------------
72
- out_f16x8f16: [' -2.05273438 ' , ' 0.2590332 ' , ' 0.42382812 ' ], time:0.03327322ms
73
- out_f16x8packf16: [' -2.05273438 ' , ' 0.2590332 ' , ' 0.42382812 ' ], time:0.02402687ms
74
- out_f16_th: [' -2.05273438 ' , ' 0.2590332 ' , ' 0.42382812 ' ], time:0.17436218ms
76
+ out_f16x8f16: [' 2.38671875 ' , ' -0.03024292 ' , ' 0.66015625 ' ], time:0.03325391ms
77
+ out_f16x8packf16: [' 2.38671875 ' , ' -0.03024292 ' , ' 0.66015625 ' ], time:0.02401376ms
78
+ out_f16x8packf32: [' 2.38671875 ' , ' -0.03021240 ' , ' 0.66064453 ' ], time:0.02381730ms
79
+ out_f16_th: [' 2.38671875 ' , ' -0.03021240 ' , ' 0.66015625 ' ], time:0.17546010ms
75
80
-------------------------------------------------------------------------------------
76
81
-------------------------------------------------------------------------------------
77
82
N=4096, K=8192
78
83
-------------------------------------------------------------------------------------
79
- out_f16x8f16: [' -1.0234375 ' , ' -0.3371582 ' , ' -1.54882812 ' ], time:0.19311237ms
80
- out_f16x8packf16: [' -1.0234375 ' , ' -0.33691406 ' , ' -1.54882812 ' ], time:0.18668032ms
81
- out_f16_th: [' -1.0234375 ' , ' -0.33691406 ' , ' -1.54882812 ' ], time:0.84443021ms
84
+ out_f16x8f16: [' 0.15905762 ' , ' 1.06542969 ' , ' -0.19396973 ' ], time:0.19306803ms
85
+ out_f16x8packf16: [' 0.15905762 ' , ' 1.06542969 ' , ' -0.19396973 ' ], time:0.18665886ms
86
+ out_f16x8packf32: [' 0.15905762 ' , ' 1.06542969 ' , ' -0.19396973 ' ], time:0.18657684ms
87
+ out_f16_th: [' 0.15905762 ' , ' 1.06542969 ' , ' -0.19396973 ' ], time:0.84462571ms
82
88
-------------------------------------------------------------------------------------
83
89
-------------------------------------------------------------------------------------
84
90
N=8192, K=8192
85
91
-------------------------------------------------------------------------------------
86
- out_f16x8f16: [' -1.03320312 ' , ' 0.41455078 ' , ' -0.49707031 ' ], time:0.38361049ms
87
- out_f16x8packf16: [' -1.03320312 ' , ' 0.41455078 ' , ' -0.49707031 ' ], time:0.40809250ms
88
- out_f16_th: [' -1.03320312 ' , ' 0.41455078 ' , ' -0.49707031 ' ], time:1.99517584ms
92
+ out_f16x8f16: [' -0.53662109 ' , ' 2.359375 ' , ' 0.78027344 ' ], time:0.38366604ms
93
+ out_f16x8packf16: [' -0.53662109 ' , ' 2.359375 ' , ' 0.78027344 ' ], time:0.40789628ms
94
+ out_f16x8packf32: [' -0.53613281 ' , ' 2.359375 ' , ' 0.78027344 ' ], time:0.40818143ms
95
+ out_f16_th: [' -0.53662109 ' , ' 2.359375 ' , ' 0.78027344 ' ], time:1.99523735ms
89
96
-------------------------------------------------------------------------------------
90
97
```
0 commit comments