@@ -22,76 +22,70 @@ python3 sgemm.py
2222输出:
2323
2424``` bash
25- -------------------------------------------------------------------------------------
26- M=2048, N=2048, K=1024
27- out_f32: [-23.44512749, 105.22006226, -72.40318298], time:2.581863ms
28- out_f32(sk): [-23.44512749, 105.22006226, -72.40318298], time:1.837885ms
29- out_f32x4(t8x8sk): [-23.44512749, 105.22006226, -72.40318298], time:0.325584ms
30- out_f32x4(t8x8bcf): [-23.44512749, 105.22006226, -72.40318298], time:0.298755ms
31- out_f32x4(t8x8dbuf): [-23.44512749, 105.22006226, -72.40318298], time:0.229251ms
32- out_f32_th: [-23.44515038, 105.22006226, -72.40312958], time:0.255888ms
33- -------------------------------------------------------------------------------------
34- -------------------------------------------------------------------------------------
35- M=2048, N=2048, K=2048
36- out_f32: [4.73375559, -2.49913216, 111.71539307], time:5.155475ms
37- out_f32(sk): [4.73375559, -2.49913216, 111.71539307], time:3.653073ms
38- out_f32x4(t8x8sk): [4.73375559, -2.49913216, 111.71539307], time:0.635004ms
39- out_f32x4(t8x8bcf): [4.73375559, -2.49913216, 111.71539307], time:0.593204ms
40- out_f32x4(t8x8dbuf): [4.73375559, -2.49913216, 111.71539307], time:0.460200ms
41- out_f32_th: [4.73375702, -2.49916267, 111.71534729], time:0.467465ms
42- -------------------------------------------------------------------------------------
43- -------------------------------------------------------------------------------------
44- M=2048, N=4096, K=1024
45- out_f32: [27.58790588, 18.39359474, -23.69882774], time:5.127516ms
46- out_f32(sk): [27.58790588, 18.39359474, -23.69882774], time:3.652875ms
47- out_f32x4(t8x8sk): [27.58790588, 18.39359474, -23.69882774], time:0.626333ms
48- out_f32x4(t8x8bcf): [27.58790588, 18.39359474, -23.69882774], time:0.549185ms
49- out_f32x4(t8x8dbuf): [27.58790588, 18.39359474, -23.69882774], time:0.463538ms
50- out_f32_th: [27.58790588, 18.39359474, -23.69882774], time:0.555634ms
51- -------------------------------------------------------------------------------------
52- -------------------------------------------------------------------------------------
53- M=2048, N=4096, K=2048
54- out_f32: [54.19274139, -0.29313943, 26.92167664], time:10.221355ms
55- out_f32(sk): [54.19274139, -0.29313943, 26.92167664], time:7.268925ms
56- out_f32x4(t8x8sk): [54.19274139, -0.29313943, 26.92167664], time:1.249781ms
57- out_f32x4(t8x8bcf): [54.19274139, -0.29313943, 26.92167664], time:1.119103ms
58- out_f32x4(t8x8dbuf): [54.19274139, -0.29313943, 26.92167664], time:0.960808ms
59- out_f32_th: [54.19275284, -0.29314613, 26.92167473], time:0.920537ms
60- -------------------------------------------------------------------------------------
61- -------------------------------------------------------------------------------------
62- M=4096, N=2048, K=1024
63- out_f32: [-37.67934418, 12.49935532, 40.71273804], time:5.120614ms
64- out_f32(sk): [-37.67934418, 12.49935532, 40.71273804], time:3.652627ms
65- out_f32x4(t8x8sk): [-37.67934418, 12.49935532, 40.71273804], time:0.624588ms
66- out_f32x4(t8x8bcf): [-37.67934418, 12.49935532, 40.71273804], time:0.545461ms
67- out_f32x4(t8x8dbuf): [-37.67934418, 12.49935532, 40.71273804], time:0.462778ms
68- out_f32_th: [-37.67934418, 12.49935532, 40.71273804], time:0.560777ms
69- -------------------------------------------------------------------------------------
70- -------------------------------------------------------------------------------------
71- M=4096, N=2048, K=2048
72- out_f32: [-15.01755524, -0.44903478, 72.23948669], time:10.213506ms
73- out_f32(sk): [-15.01755524, -0.44903478, 72.23948669], time:7.269592ms
74- out_f32x4(t8x8sk): [-15.01755524, -0.44903478, 72.23948669], time:1.242898ms
75- out_f32x4(t8x8bcf): [-15.01755524, -0.44903478, 72.23948669], time:1.099443ms
76- out_f32x4(t8x8dbuf): [-15.01755524, -0.44903478, 72.23948669], time:0.941424ms
77- out_f32_th: [-15.01752663, -0.44904327, 72.23952484], time:0.940223ms
78- -------------------------------------------------------------------------------------
79- -------------------------------------------------------------------------------------
80- M=4096, N=4096, K=1024
81- out_f32: [-5.76778412, 22.12718964, 17.76623344], time:10.221822ms
82- out_f32(sk): [-5.76778412, 22.12718964, 17.76623344], time:7.308133ms
83- out_f32x4(t8x8sk): [-5.76778412, 22.12718964, 17.76623344], time:1.263077ms
84- out_f32x4(t8x8bcf): [-5.76778412, 22.12718964, 17.76623344], time:1.134577ms
85- out_f32x4(t8x8dbuf): [-5.76778412, 22.12718964, 17.76623344], time:1.009488ms
86- out_f32_th: [-5.76778412, 22.12718964, 17.76623344], time:0.926571ms
87- -------------------------------------------------------------------------------------
88- -------------------------------------------------------------------------------------
89- M=4096, N=4096, K=2048
90- out_f32: [35.152565, 56.02351761, 29.87486458], time:20.362103ms
91- out_f32(sk): [35.152565, 56.02351761, 29.87486458], time:14.596984ms
92- out_f32x4(t8x8sk): [35.152565, 56.02351761, 29.87486458], time:2.558391ms
93- out_f32x4(t8x8bcf): [35.152565, 56.02351761, 29.87486458], time:2.313538ms
94- out_f32x4(t8x8dbuf): [35.152565, 56.02351761, 29.87486458], time:2.144170ms
95- out_f32_th: [35.152565, 56.02351761, 29.87486458], time:1.896987ms
96- -------------------------------------------------------------------------------------
25+ ----------------------------------------------------------------------------------------------------
26+ M=2048, N=2048, K=1024
27+ out_f32: [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:2.583222ms
28+ out_f32(sk): [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:1.836123ms
29+ out_f32x4(t8x8sk): [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:0.324936ms
30+ out_f32x4(t8x8bcf): [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:0.290537ms
31+ out_f32x4(t8x8bcf+offset): [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:0.289106ms
32+ out_f32x4(t8x8dbuf): [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:0.229044ms
33+ out_f32x4(t8x8dbuf+offset): [' -41.69404602' , ' -15.22974205' , ' 12.31010342 ' ], time:0.230970ms
34+ out_f32_th: [' -41.69403076' , ' -15.229743 ' , ' 12.31009007 ' ], time:0.255721ms
35+ ----------------------------------------------------------------------------------------------------
36+ ----------------------------------------------------------------------------------------------------
37+ M=2048, N=2048, K=2048
38+ out_f32: [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:5.152175ms
39+ out_f32(sk): [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:3.652353ms
40+ out_f32x4(t8x8sk): [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:0.639246ms
41+ out_f32x4(t8x8bcf): [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:0.576742ms
42+ out_f32x4(t8x8bcf+offset): [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:0.575581ms
43+ out_f32x4(t8x8dbuf): [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:0.460470ms
44+ out_f32x4(t8x8dbuf+offset): [' -11.50634861' , ' -30.57016182' , ' 14.03067684 ' ], time:0.465369ms
45+ out_f32_th: [' -11.50632 ' , ' -30.57013321' , ' 14.03067398 ' ], time:0.465064ms
46+ ----------------------------------------------------------------------------------------------------
47+ ----------------------------------------------------------------------------------------------------
48+ M=2048, N=4096, K=1024
49+ out_f32: [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:5.122924ms
50+ out_f32(sk): [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:3.653028ms
51+ out_f32x4(t8x8sk): [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:0.625312ms
52+ out_f32x4(t8x8bcf): [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:0.534370ms
53+ out_f32x4(t8x8bcf+offset): [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:0.530348ms
54+ out_f32x4(t8x8dbuf): [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:0.462132ms
55+ out_f32x4(t8x8dbuf+offset): [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:0.464492ms
56+ out_f32_th: [' 35.35253143 ' , ' 44.40952682 ' , ' -10.71832466' ], time:0.557373ms
57+ ----------------------------------------------------------------------------------------------------
58+ ----------------------------------------------------------------------------------------------------
59+ M=2048, N=4096, K=2048
60+ out_f32: [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:10.218813ms
61+ out_f32(sk): [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:7.268655ms
62+ out_f32x4(t8x8sk): [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:1.237755ms
63+ out_f32x4(t8x8bcf): [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:1.065564ms
64+ out_f32x4(t8x8bcf+offset): [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:1.053824ms
65+ out_f32x4(t8x8dbuf): [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:0.935848ms
66+ out_f32x4(t8x8dbuf+offset): [' 61.41757584 ' , ' 107.04826355' , ' 37.28448868 ' ], time:0.967648ms
67+ out_f32_th: [' 61.41755676 ' , ' 107.04829407' , ' 37.28450775 ' ], time:0.921094ms
68+ ----------------------------------------------------------------------------------------------------
69+ ----------------------------------------------------------------------------------------------------
70+ M=4096, N=2048, K=1024
71+ out_f32: [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:5.120900ms
72+ out_f32(sk): [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:3.651984ms
73+ out_f32x4(t8x8sk): [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:0.622756ms
74+ out_f32x4(t8x8bcf): [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:0.526509ms
75+ out_f32x4(t8x8bcf+offset): [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:0.529506ms
76+ out_f32x4(t8x8dbuf): [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:0.451362ms
77+ out_f32x4(t8x8dbuf+offset): [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:0.462964ms
78+ out_f32_th: [' 69.17631531 ' , ' 2.35151434 ' , ' 14.92191601 ' ], time:0.552487ms
79+ ----------------------------------------------------------------------------------------------------
80+ ----------------------------------------------------------------------------------------------------
81+ M=4096, N=2048, K=2048
82+ out_f32: [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:10.213661ms
83+ out_f32(sk): [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:7.267971ms
84+ out_f32x4(t8x8sk): [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:1.244769ms
85+ out_f32x4(t8x8bcf): [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:1.076307ms
86+ out_f32x4(t8x8bcf+offset): [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:1.074743ms
87+ out_f32x4(t8x8dbuf): [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:0.948534ms
88+ out_f32x4(t8x8dbuf+offset): [' 62.51137161 ' , ' -45.17026138' , ' 61.54212952 ' ], time:0.963700ms
89+ out_f32_th: [' 62.51136398 ' , ' -45.17026138' , ' 61.54217911 ' ], time:0.916274ms
90+ ----------------------------------------------------------------------------------------------------
9791```
0 commit comments