@@ -927,39 +927,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"jg .label24\n"
/* multiply the result by alpha */
"vbroadcastsd (%[alpha]), %%zmm9\n"
- "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
- "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
- "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
- "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
- "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
- "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
- "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
- "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
- "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
- "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
- "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
- "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
- "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
- "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
- "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
- "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
- "vmulpd %%zmm9, %%zmm21, %%zmm21\n"
- "vmulpd %%zmm9, %%zmm22, %%zmm22\n"
- "vmulpd %%zmm9, %%zmm23, %%zmm23\n"
- "vmulpd %%zmm9, %%zmm24, %%zmm24\n"
- "vmulpd %%zmm9, %%zmm25, %%zmm25\n"
- "vmulpd %%zmm9, %%zmm26, %%zmm26\n"
- "vmulpd %%zmm9, %%zmm27, %%zmm27\n"
- "vmulpd %%zmm9, %%zmm28, %%zmm28\n"
/* And store additively in C */
- "vaddpd (%[C0]), %%zmm1 , %%zmm1\n"
- "vaddpd (%[C1]), %%zmm2 , %%zmm2\n"
- "vaddpd (%[C2]), %%zmm3 , %%zmm3\n"
- "vaddpd (%[C3]), %%zmm4 , %%zmm4\n"
- "vaddpd (%[C4]), %%zmm5 , %%zmm5\n"
- "vaddpd (%[C5]), %%zmm6 , %%zmm6\n"
- "vaddpd (%[C6]), %%zmm7 , %%zmm7\n"
- "vaddpd (%[C7]), %%zmm8 , %%zmm8\n"
+ "vfmadd213pd (%[C0]), %%zmm9 , %%zmm1\n"
+ "vfmadd213pd (%[C1]), %%zmm9 , %%zmm2\n"
+ "vfmadd213pd (%[C2]), %%zmm9 , %%zmm3\n"
+ "vfmadd213pd (%[C3]), %%zmm9 , %%zmm4\n"
+ "vfmadd213pd (%[C4]), %%zmm9 , %%zmm5\n"
+ "vfmadd213pd (%[C5]), %%zmm9 , %%zmm6\n"
+ "vfmadd213pd (%[C6]), %%zmm9 , %%zmm7\n"
+ "vfmadd213pd (%[C7]), %%zmm9 , %%zmm8\n"
"vmovupd %%zmm1, (%[C0])\n"
"vmovupd %%zmm2, (%[C1])\n"
"vmovupd %%zmm3, (%[C2])\n"
@@ -969,14 +945,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"vmovupd %%zmm7, (%[C6])\n"
"vmovupd %%zmm8, (%[C7])\n"

- "vaddpd 64(%[C0]), %%zmm11 , %%zmm11\n"
- "vaddpd 64(%[C1]), %%zmm12 , %%zmm12\n"
- "vaddpd 64(%[C2]), %%zmm13 , %%zmm13\n"
- "vaddpd 64(%[C3]), %%zmm14 , %%zmm14\n"
- "vaddpd 64(%[C4]), %%zmm15 , %%zmm15\n"
- "vaddpd 64(%[C5]), %%zmm16 , %%zmm16\n"
- "vaddpd 64(%[C6]), %%zmm17 , %%zmm17\n"
- "vaddpd 64(%[C7]), %%zmm18 , %%zmm18\n"
+ "vfmadd213pd 64(%[C0]), %%zmm9 , %%zmm11\n"
+ "vfmadd213pd 64(%[C1]), %%zmm9 , %%zmm12\n"
+ "vfmadd213pd 64(%[C2]), %%zmm9 , %%zmm13\n"
+ "vfmadd213pd 64(%[C3]), %%zmm9 , %%zmm14\n"
+ "vfmadd213pd 64(%[C4]), %%zmm9 , %%zmm15\n"
+ "vfmadd213pd 64(%[C5]), %%zmm9 , %%zmm16\n"
+ "vfmadd213pd 64(%[C6]), %%zmm9 , %%zmm17\n"
+ "vfmadd213pd 64(%[C7]), %%zmm9 , %%zmm18\n"
"vmovupd %%zmm11, 64(%[C0])\n"
"vmovupd %%zmm12, 64(%[C1])\n"
"vmovupd %%zmm13, 64(%[C2])\n"
@@ -986,14 +962,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"vmovupd %%zmm17, 64(%[C6])\n"
"vmovupd %%zmm18, 64(%[C7])\n"

- "vaddpd 128(%[C0]), %%zmm21 , %%zmm21\n"
- "vaddpd 128(%[C1]), %%zmm22 , %%zmm22\n"
- "vaddpd 128(%[C2]), %%zmm23 , %%zmm23\n"
- "vaddpd 128(%[C3]), %%zmm24 , %%zmm24\n"
- "vaddpd 128(%[C4]), %%zmm25 , %%zmm25\n"
- "vaddpd 128(%[C5]), %%zmm26 , %%zmm26\n"
- "vaddpd 128(%[C6]), %%zmm27 , %%zmm27\n"
- "vaddpd 128(%[C7]), %%zmm28 , %%zmm28\n"
+ "vfmadd213pd 128(%[C0]), %%zmm9 , %%zmm21\n"
+ "vfmadd213pd 128(%[C1]), %%zmm9 , %%zmm22\n"
+ "vfmadd213pd 128(%[C2]), %%zmm9 , %%zmm23\n"
+ "vfmadd213pd 128(%[C3]), %%zmm9 , %%zmm24\n"
+ "vfmadd213pd 128(%[C4]), %%zmm9 , %%zmm25\n"
+ "vfmadd213pd 128(%[C5]), %%zmm9 , %%zmm26\n"
+ "vfmadd213pd 128(%[C6]), %%zmm9 , %%zmm27\n"
+ "vfmadd213pd 128(%[C7]), %%zmm9 , %%zmm28\n"
"vmovupd %%zmm21, 128(%[C0])\n"
"vmovupd %%zmm22, 128(%[C1])\n"
"vmovupd %%zmm23, 128(%[C2])\n"
@@ -1108,31 +1084,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"jg .label16\n"
/* multiply the result by alpha */
"vbroadcastsd (%[alpha]), %%zmm9\n"
- "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
- "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
- "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
- "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
- "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
- "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
- "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
- "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
- "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
- "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
- "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
- "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
- "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
- "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
- "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
- "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
/* And store additively in C */
- "vaddpd (%[C0]), %%zmm1 , %%zmm1\n"
- "vaddpd (%[C1]), %%zmm2 , %%zmm2\n"
- "vaddpd (%[C2]), %%zmm3 , %%zmm3\n"
- "vaddpd (%[C3]), %%zmm4 , %%zmm4\n"
- "vaddpd (%[C4]), %%zmm5 , %%zmm5\n"
- "vaddpd (%[C5]), %%zmm6 , %%zmm6\n"
- "vaddpd (%[C6]), %%zmm7 , %%zmm7\n"
- "vaddpd (%[C7]), %%zmm8 , %%zmm8\n"
+ "vfmadd213pd (%[C0]), %%zmm9 , %%zmm1\n"
+ "vfmadd213pd (%[C1]), %%zmm9 , %%zmm2\n"
+ "vfmadd213pd (%[C2]), %%zmm9 , %%zmm3\n"
+ "vfmadd213pd (%[C3]), %%zmm9 , %%zmm4\n"
+ "vfmadd213pd (%[C4]), %%zmm9 , %%zmm5\n"
+ "vfmadd213pd (%[C5]), %%zmm9 , %%zmm6\n"
+ "vfmadd213pd (%[C6]), %%zmm9 , %%zmm7\n"
+ "vfmadd213pd (%[C7]), %%zmm9 , %%zmm8\n"
"vmovupd %%zmm1, (%[C0])\n"
"vmovupd %%zmm2, (%[C1])\n"
"vmovupd %%zmm3, (%[C2])\n"
@@ -1142,14 +1102,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"vmovupd %%zmm7, (%[C6])\n"
"vmovupd %%zmm8, (%[C7])\n"

- "vaddpd 64(%[C0]), %%zmm11 , %%zmm11\n"
- "vaddpd 64(%[C1]), %%zmm12 , %%zmm12\n"
- "vaddpd 64(%[C2]), %%zmm13 , %%zmm13\n"
- "vaddpd 64(%[C3]), %%zmm14 , %%zmm14\n"
- "vaddpd 64(%[C4]), %%zmm15 , %%zmm15\n"
- "vaddpd 64(%[C5]), %%zmm16 , %%zmm16\n"
- "vaddpd 64(%[C6]), %%zmm17 , %%zmm17\n"
- "vaddpd 64(%[C7]), %%zmm18 , %%zmm18\n"
+ "vfmadd213pd 64(%[C0]), %%zmm9 , %%zmm11\n"
+ "vfmadd213pd 64(%[C1]), %%zmm9 , %%zmm12\n"
+ "vfmadd213pd 64(%[C2]), %%zmm9 , %%zmm13\n"
+ "vfmadd213pd 64(%[C3]), %%zmm9 , %%zmm14\n"
+ "vfmadd213pd 64(%[C4]), %%zmm9 , %%zmm15\n"
+ "vfmadd213pd 64(%[C5]), %%zmm9 , %%zmm16\n"
+ "vfmadd213pd 64(%[C6]), %%zmm9 , %%zmm17\n"
+ "vfmadd213pd 64(%[C7]), %%zmm9 , %%zmm18\n"
"vmovupd %%zmm11, 64(%[C0])\n"
"vmovupd %%zmm12, 64(%[C1])\n"
"vmovupd %%zmm13, 64(%[C2])\n"
@@ -1221,24 +1181,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"add $64, %[BO]\n"
"subl $1, %[kloop]\n"
"jg .label1\n"
- /* multiply the result by alpha */
- "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
- "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
- "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
- "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
- "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
- "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
- "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
- "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
- /* And store additively in C */
- "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
- "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
- "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
- "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
- "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
- "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
- "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
- "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ /* multiply the result by alpha and add to the memory */
+ "vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
+ "vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
+ "vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
+ "vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
+ "vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
+ "vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
+ "vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
+ "vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
"vmovupd %%zmm1, (%[C0])\n"
"vmovupd %%zmm2, (%[C1])\n"
"vmovupd %%zmm3, (%[C2])\n"
@@ -1247,14 +1198,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
"vmovupd %%zmm6, (%[C5])\n"
"vmovupd %%zmm7, (%[C6])\n"
"vmovupd %%zmm8, (%[C7])\n"
- "prefetchw 64(%[C0])\n"
- "prefetchw 64(%[C1])\n"
- "prefetchw 64(%[C2])\n"
- "prefetchw 64(%[C3])\n"
- "prefetchw 64(%[C4])\n"
- "prefetchw 64(%[C5])\n"
- "prefetchw 64(%[C6])\n"
- "prefetchw 64(%[C7])\n"
:
[AO ] "+r" (AO ),
[BO ] "+r" (BO ),
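Note on the change: each epilogue above used to scale the accumulator registers by alpha with vmulpd and then add the corresponding C tile with vaddpd; the patch fuses both steps into a single vfmadd213pd with a memory source operand, which computes acc = acc * alpha + C per vector and roughly halves the instruction count of the store path (the trailing prefetchw hints for the next C tile are dropped as well). A minimal sketch of the equivalence in AVX-512F intrinsics, for one 8-double column of the C tile; the helper names store_old/store_new and the variables acc, alpha, c are illustrative only and not part of the kernel:

#include <immintrin.h>

/* acc holds the accumulated A*B block for one column of the C tile,
   alpha is the broadcast scale factor (zmm9 in the kernel). */
static inline void store_old(double *c, __m512d acc, __m512d alpha)
{
    acc = _mm512_mul_pd(alpha, acc);               /* vmulpd:  acc *= alpha */
    acc = _mm512_add_pd(_mm512_loadu_pd(c), acc);  /* vaddpd:  acc += C     */
    _mm512_storeu_pd(c, acc);                      /* vmovupd: C = acc      */
}

static inline void store_new(double *c, __m512d acc, __m512d alpha)
{
    /* vfmadd213pd with a memory operand: acc = acc * alpha + C */
    acc = _mm512_fmadd_pd(acc, alpha, _mm512_loadu_pd(c));
    _mm512_storeu_pd(c, acc);                      /* vmovupd: C = acc      */
}

Because the fused form rounds once instead of twice, results can differ from the old two-instruction sequence in the last bit of each element.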