Skip to content

Commit 582c589

Browse files
committed
dgemm/skylakex: replace discrete mul/add with fma
Only very minor gains, since this is not especially hot code, but fusing the multiply and add into a single FMA is the right thing to do on general principle.
1 parent adbf6af commit 582c589

File tree

1 file changed

+49
-106
lines changed

1 file changed

+49
-106
lines changed

kernel/x86_64/dgemm_kernel_4x8_skylakex.c

Lines changed: 49 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -927,39 +927,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
927927
"jg .label24\n"
928928
/* multiply the result by alpha */
929929
"vbroadcastsd (%[alpha]), %%zmm9\n"
930-
"vmulpd %%zmm9, %%zmm1, %%zmm1\n"
931-
"vmulpd %%zmm9, %%zmm2, %%zmm2\n"
932-
"vmulpd %%zmm9, %%zmm3, %%zmm3\n"
933-
"vmulpd %%zmm9, %%zmm4, %%zmm4\n"
934-
"vmulpd %%zmm9, %%zmm5, %%zmm5\n"
935-
"vmulpd %%zmm9, %%zmm6, %%zmm6\n"
936-
"vmulpd %%zmm9, %%zmm7, %%zmm7\n"
937-
"vmulpd %%zmm9, %%zmm8, %%zmm8\n"
938-
"vmulpd %%zmm9, %%zmm11, %%zmm11\n"
939-
"vmulpd %%zmm9, %%zmm12, %%zmm12\n"
940-
"vmulpd %%zmm9, %%zmm13, %%zmm13\n"
941-
"vmulpd %%zmm9, %%zmm14, %%zmm14\n"
942-
"vmulpd %%zmm9, %%zmm15, %%zmm15\n"
943-
"vmulpd %%zmm9, %%zmm16, %%zmm16\n"
944-
"vmulpd %%zmm9, %%zmm17, %%zmm17\n"
945-
"vmulpd %%zmm9, %%zmm18, %%zmm18\n"
946-
"vmulpd %%zmm9, %%zmm21, %%zmm21\n"
947-
"vmulpd %%zmm9, %%zmm22, %%zmm22\n"
948-
"vmulpd %%zmm9, %%zmm23, %%zmm23\n"
949-
"vmulpd %%zmm9, %%zmm24, %%zmm24\n"
950-
"vmulpd %%zmm9, %%zmm25, %%zmm25\n"
951-
"vmulpd %%zmm9, %%zmm26, %%zmm26\n"
952-
"vmulpd %%zmm9, %%zmm27, %%zmm27\n"
953-
"vmulpd %%zmm9, %%zmm28, %%zmm28\n"
954930
/* And store additively in C */
955-
"vaddpd (%[C0]), %%zmm1, %%zmm1\n"
956-
"vaddpd (%[C1]), %%zmm2, %%zmm2\n"
957-
"vaddpd (%[C2]), %%zmm3, %%zmm3\n"
958-
"vaddpd (%[C3]), %%zmm4, %%zmm4\n"
959-
"vaddpd (%[C4]), %%zmm5, %%zmm5\n"
960-
"vaddpd (%[C5]), %%zmm6, %%zmm6\n"
961-
"vaddpd (%[C6]), %%zmm7, %%zmm7\n"
962-
"vaddpd (%[C7]), %%zmm8, %%zmm8\n"
931+
"vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
932+
"vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
933+
"vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
934+
"vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
935+
"vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
936+
"vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
937+
"vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
938+
"vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
963939
"vmovupd %%zmm1, (%[C0])\n"
964940
"vmovupd %%zmm2, (%[C1])\n"
965941
"vmovupd %%zmm3, (%[C2])\n"
@@ -969,14 +945,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
969945
"vmovupd %%zmm7, (%[C6])\n"
970946
"vmovupd %%zmm8, (%[C7])\n"
971947

972-
"vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
973-
"vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
974-
"vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
975-
"vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
976-
"vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
977-
"vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
978-
"vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
979-
"vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
948+
"vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n"
949+
"vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n"
950+
"vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n"
951+
"vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n"
952+
"vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n"
953+
"vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n"
954+
"vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n"
955+
"vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n"
980956
"vmovupd %%zmm11, 64(%[C0])\n"
981957
"vmovupd %%zmm12, 64(%[C1])\n"
982958
"vmovupd %%zmm13, 64(%[C2])\n"
@@ -986,14 +962,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
986962
"vmovupd %%zmm17, 64(%[C6])\n"
987963
"vmovupd %%zmm18, 64(%[C7])\n"
988964

989-
"vaddpd 128(%[C0]), %%zmm21, %%zmm21\n"
990-
"vaddpd 128(%[C1]), %%zmm22, %%zmm22\n"
991-
"vaddpd 128(%[C2]), %%zmm23, %%zmm23\n"
992-
"vaddpd 128(%[C3]), %%zmm24, %%zmm24\n"
993-
"vaddpd 128(%[C4]), %%zmm25, %%zmm25\n"
994-
"vaddpd 128(%[C5]), %%zmm26, %%zmm26\n"
995-
"vaddpd 128(%[C6]), %%zmm27, %%zmm27\n"
996-
"vaddpd 128(%[C7]), %%zmm28, %%zmm28\n"
965+
"vfmadd213pd 128(%[C0]), %%zmm9, %%zmm21\n"
966+
"vfmadd213pd 128(%[C1]), %%zmm9, %%zmm22\n"
967+
"vfmadd213pd 128(%[C2]), %%zmm9, %%zmm23\n"
968+
"vfmadd213pd 128(%[C3]), %%zmm9, %%zmm24\n"
969+
"vfmadd213pd 128(%[C4]), %%zmm9, %%zmm25\n"
970+
"vfmadd213pd 128(%[C5]), %%zmm9, %%zmm26\n"
971+
"vfmadd213pd 128(%[C6]), %%zmm9, %%zmm27\n"
972+
"vfmadd213pd 128(%[C7]), %%zmm9, %%zmm28\n"
997973
"vmovupd %%zmm21, 128(%[C0])\n"
998974
"vmovupd %%zmm22, 128(%[C1])\n"
999975
"vmovupd %%zmm23, 128(%[C2])\n"
@@ -1108,31 +1084,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
11081084
"jg .label16\n"
11091085
/* multiply the result by alpha */
11101086
"vbroadcastsd (%[alpha]), %%zmm9\n"
1111-
"vmulpd %%zmm9, %%zmm1, %%zmm1\n"
1112-
"vmulpd %%zmm9, %%zmm2, %%zmm2\n"
1113-
"vmulpd %%zmm9, %%zmm3, %%zmm3\n"
1114-
"vmulpd %%zmm9, %%zmm4, %%zmm4\n"
1115-
"vmulpd %%zmm9, %%zmm5, %%zmm5\n"
1116-
"vmulpd %%zmm9, %%zmm6, %%zmm6\n"
1117-
"vmulpd %%zmm9, %%zmm7, %%zmm7\n"
1118-
"vmulpd %%zmm9, %%zmm8, %%zmm8\n"
1119-
"vmulpd %%zmm9, %%zmm11, %%zmm11\n"
1120-
"vmulpd %%zmm9, %%zmm12, %%zmm12\n"
1121-
"vmulpd %%zmm9, %%zmm13, %%zmm13\n"
1122-
"vmulpd %%zmm9, %%zmm14, %%zmm14\n"
1123-
"vmulpd %%zmm9, %%zmm15, %%zmm15\n"
1124-
"vmulpd %%zmm9, %%zmm16, %%zmm16\n"
1125-
"vmulpd %%zmm9, %%zmm17, %%zmm17\n"
1126-
"vmulpd %%zmm9, %%zmm18, %%zmm18\n"
11271087
/* And store additively in C */
1128-
"vaddpd (%[C0]), %%zmm1, %%zmm1\n"
1129-
"vaddpd (%[C1]), %%zmm2, %%zmm2\n"
1130-
"vaddpd (%[C2]), %%zmm3, %%zmm3\n"
1131-
"vaddpd (%[C3]), %%zmm4, %%zmm4\n"
1132-
"vaddpd (%[C4]), %%zmm5, %%zmm5\n"
1133-
"vaddpd (%[C5]), %%zmm6, %%zmm6\n"
1134-
"vaddpd (%[C6]), %%zmm7, %%zmm7\n"
1135-
"vaddpd (%[C7]), %%zmm8, %%zmm8\n"
1088+
"vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
1089+
"vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
1090+
"vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
1091+
"vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
1092+
"vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
1093+
"vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
1094+
"vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
1095+
"vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
11361096
"vmovupd %%zmm1, (%[C0])\n"
11371097
"vmovupd %%zmm2, (%[C1])\n"
11381098
"vmovupd %%zmm3, (%[C2])\n"
@@ -1142,14 +1102,14 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
11421102
"vmovupd %%zmm7, (%[C6])\n"
11431103
"vmovupd %%zmm8, (%[C7])\n"
11441104

1145-
"vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
1146-
"vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
1147-
"vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
1148-
"vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
1149-
"vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
1150-
"vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
1151-
"vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
1152-
"vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
1105+
"vfmadd213pd 64(%[C0]), %%zmm9, %%zmm11\n"
1106+
"vfmadd213pd 64(%[C1]), %%zmm9, %%zmm12\n"
1107+
"vfmadd213pd 64(%[C2]), %%zmm9, %%zmm13\n"
1108+
"vfmadd213pd 64(%[C3]), %%zmm9, %%zmm14\n"
1109+
"vfmadd213pd 64(%[C4]), %%zmm9, %%zmm15\n"
1110+
"vfmadd213pd 64(%[C5]), %%zmm9, %%zmm16\n"
1111+
"vfmadd213pd 64(%[C6]), %%zmm9, %%zmm17\n"
1112+
"vfmadd213pd 64(%[C7]), %%zmm9, %%zmm18\n"
11531113
"vmovupd %%zmm11, 64(%[C0])\n"
11541114
"vmovupd %%zmm12, 64(%[C1])\n"
11551115
"vmovupd %%zmm13, 64(%[C2])\n"
@@ -1221,24 +1181,15 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
12211181
"add $64, %[BO]\n"
12221182
"subl $1, %[kloop]\n"
12231183
"jg .label1\n"
1224-
/* multiply the result by alpha */
1225-
"vmulpd %%zmm9, %%zmm1, %%zmm1\n"
1226-
"vmulpd %%zmm9, %%zmm2, %%zmm2\n"
1227-
"vmulpd %%zmm9, %%zmm3, %%zmm3\n"
1228-
"vmulpd %%zmm9, %%zmm4, %%zmm4\n"
1229-
"vmulpd %%zmm9, %%zmm5, %%zmm5\n"
1230-
"vmulpd %%zmm9, %%zmm6, %%zmm6\n"
1231-
"vmulpd %%zmm9, %%zmm7, %%zmm7\n"
1232-
"vmulpd %%zmm9, %%zmm8, %%zmm8\n"
1233-
/* And store additively in C */
1234-
"vaddpd (%[C0]), %%zmm1, %%zmm1\n"
1235-
"vaddpd (%[C1]), %%zmm2, %%zmm2\n"
1236-
"vaddpd (%[C2]), %%zmm3, %%zmm3\n"
1237-
"vaddpd (%[C3]), %%zmm4, %%zmm4\n"
1238-
"vaddpd (%[C4]), %%zmm5, %%zmm5\n"
1239-
"vaddpd (%[C5]), %%zmm6, %%zmm6\n"
1240-
"vaddpd (%[C6]), %%zmm7, %%zmm7\n"
1241-
"vaddpd (%[C7]), %%zmm8, %%zmm8\n"
1184+
/* multiply the result by alpha and add the current contents of C (fused multiply-add) */
1185+
"vfmadd213pd (%[C0]), %%zmm9, %%zmm1\n"
1186+
"vfmadd213pd (%[C1]), %%zmm9, %%zmm2\n"
1187+
"vfmadd213pd (%[C2]), %%zmm9, %%zmm3\n"
1188+
"vfmadd213pd (%[C3]), %%zmm9, %%zmm4\n"
1189+
"vfmadd213pd (%[C4]), %%zmm9, %%zmm5\n"
1190+
"vfmadd213pd (%[C5]), %%zmm9, %%zmm6\n"
1191+
"vfmadd213pd (%[C6]), %%zmm9, %%zmm7\n"
1192+
"vfmadd213pd (%[C7]), %%zmm9, %%zmm8\n"
12421193
"vmovupd %%zmm1, (%[C0])\n"
12431194
"vmovupd %%zmm2, (%[C1])\n"
12441195
"vmovupd %%zmm3, (%[C2])\n"
@@ -1247,14 +1198,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
12471198
"vmovupd %%zmm6, (%[C5])\n"
12481199
"vmovupd %%zmm7, (%[C6])\n"
12491200
"vmovupd %%zmm8, (%[C7])\n"
1250-
"prefetchw 64(%[C0])\n"
1251-
"prefetchw 64(%[C1])\n"
1252-
"prefetchw 64(%[C2])\n"
1253-
"prefetchw 64(%[C3])\n"
1254-
"prefetchw 64(%[C4])\n"
1255-
"prefetchw 64(%[C5])\n"
1256-
"prefetchw 64(%[C6])\n"
1257-
"prefetchw 64(%[C7])\n"
12581201
:
12591202
[AO] "+r" (AO),
12601203
[BO] "+r" (BO),

0 commit comments

Comments
 (0)