
Commit 6f22e1c

Merge pull request #1788 from fenrus75/avx512-8x16
skylake dgemm: Add a 16x8 kernel
2 parents: a980953 + 66b43af


kernel/x86_64/dgemm_kernel_4x8_skylakex.c

Lines changed: 355 additions & 1 deletion
@@ -849,18 +849,372 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 
   i = m;
 
-  while (i >= 8) {
+  while (i >= 24) {
+    double *BO;
+    double *A1, *A2;
+    int kloop = K;
+
+    BO = B + 12;
+    A1 = AO + 8 * K;
+    A2 = AO + 16 * K;
+    /*
+     * This is the inner loop for the hot hot path
+     * Written in inline asm because compilers like GCC 8 and earlier
+     * struggle with register allocation and are not good at using
+     * the AVX512 built in broadcast ability (1to8)
+     */
+    asm(
+      "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
+      "vmovapd %%zmm1, %%zmm2\n"
+      "vmovapd %%zmm1, %%zmm3\n"
+      "vmovapd %%zmm1, %%zmm4\n"
+      "vmovapd %%zmm1, %%zmm5\n"
+      "vmovapd %%zmm1, %%zmm6\n"
+      "vmovapd %%zmm1, %%zmm7\n"
+      "vmovapd %%zmm1, %%zmm8\n"
+      "vmovapd %%zmm1, %%zmm11\n"
+      "vmovapd %%zmm1, %%zmm12\n"
+      "vmovapd %%zmm1, %%zmm13\n"
+      "vmovapd %%zmm1, %%zmm14\n"
+      "vmovapd %%zmm1, %%zmm15\n"
+      "vmovapd %%zmm1, %%zmm16\n"
+      "vmovapd %%zmm1, %%zmm17\n"
+      "vmovapd %%zmm1, %%zmm18\n"
+      "vmovapd %%zmm1, %%zmm21\n"
+      "vmovapd %%zmm1, %%zmm22\n"
+      "vmovapd %%zmm1, %%zmm23\n"
+      "vmovapd %%zmm1, %%zmm24\n"
+      "vmovapd %%zmm1, %%zmm25\n"
+      "vmovapd %%zmm1, %%zmm26\n"
+      "vmovapd %%zmm1, %%zmm27\n"
+      "vmovapd %%zmm1, %%zmm28\n"
+      "jmp .label24\n"
+      ".align 32\n"
+      /* Inner math loop */
+      ".label24:\n"
+      "vmovupd -128(%[AO]),%%zmm0\n"
+      "vmovupd -128(%[A1]),%%zmm10\n"
+      "vmovupd -128(%[A2]),%%zmm20\n"
+
+      "vbroadcastsd -96(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm21\n"
+
+      "vbroadcastsd -88(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm22\n"
+
+      "vbroadcastsd -80(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm23\n"
+
+      "vbroadcastsd -72(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm24\n"
+
+      "vbroadcastsd -64(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm25\n"
+
+      "vbroadcastsd -56(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm26\n"
+
+      "vbroadcastsd -48(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm27\n"
+
+      "vbroadcastsd -40(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n"
+      "vfmadd231pd %%zmm9, %%zmm20, %%zmm28\n"
+      "add $64, %[AO]\n"
+      "add $64, %[A1]\n"
+      "add $64, %[A2]\n"
+      "add $64, %[BO]\n"
+      "prefetch 512(%[AO])\n"
+      "prefetch 512(%[A1])\n"
+      "prefetch 512(%[A2])\n"
+      "prefetch 512(%[BO])\n"
+      "subl $1, %[kloop]\n"
+      "jg .label24\n"
+      /* multiply the result by alpha */
+      "vbroadcastsd (%[alpha]), %%zmm9\n"
+      "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
+      "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
+      "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
+      "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
+      "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
+      "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
+      "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
+      "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
+      "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
+      "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
+      "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
+      "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
+      "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
+      "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
+      "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
+      "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
+      "vmulpd %%zmm9, %%zmm21, %%zmm21\n"
+      "vmulpd %%zmm9, %%zmm22, %%zmm22\n"
+      "vmulpd %%zmm9, %%zmm23, %%zmm23\n"
+      "vmulpd %%zmm9, %%zmm24, %%zmm24\n"
+      "vmulpd %%zmm9, %%zmm25, %%zmm25\n"
+      "vmulpd %%zmm9, %%zmm26, %%zmm26\n"
+      "vmulpd %%zmm9, %%zmm27, %%zmm27\n"
+      "vmulpd %%zmm9, %%zmm28, %%zmm28\n"
+      /* And store additively in C */
+      "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
+      "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
+      "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
+      "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
+      "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
+      "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
+      "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
+      "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+      "vmovupd %%zmm1, (%[C0])\n"
+      "vmovupd %%zmm2, (%[C1])\n"
+      "vmovupd %%zmm3, (%[C2])\n"
+      "vmovupd %%zmm4, (%[C3])\n"
+      "vmovupd %%zmm5, (%[C4])\n"
+      "vmovupd %%zmm6, (%[C5])\n"
+      "vmovupd %%zmm7, (%[C6])\n"
+      "vmovupd %%zmm8, (%[C7])\n"
+
+      "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
+      "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
+      "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
+      "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
+      "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
+      "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
+      "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
+      "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+      "vmovupd %%zmm11, 64(%[C0])\n"
+      "vmovupd %%zmm12, 64(%[C1])\n"
+      "vmovupd %%zmm13, 64(%[C2])\n"
+      "vmovupd %%zmm14, 64(%[C3])\n"
+      "vmovupd %%zmm15, 64(%[C4])\n"
+      "vmovupd %%zmm16, 64(%[C5])\n"
+      "vmovupd %%zmm17, 64(%[C6])\n"
+      "vmovupd %%zmm18, 64(%[C7])\n"
+
+      "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n"
+      "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n"
+      "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n"
+      "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n"
+      "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n"
+      "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n"
+      "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n"
+      "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n"
+      "vmovupd %%zmm21, 128(%[C0])\n"
+      "vmovupd %%zmm22, 128(%[C1])\n"
+      "vmovupd %%zmm23, 128(%[C2])\n"
+      "vmovupd %%zmm24, 128(%[C3])\n"
+      "vmovupd %%zmm25, 128(%[C4])\n"
+      "vmovupd %%zmm26, 128(%[C5])\n"
+      "vmovupd %%zmm27, 128(%[C6])\n"
+      "vmovupd %%zmm28, 128(%[C7])\n"
+
+      :
+      [AO] "+r" (AO),
+      [A1] "+r" (A1),
+      [A2] "+r" (A2),
+      [BO] "+r" (BO),
+      [C0] "+r" (CO1),
+      [kloop] "+r" (kloop)
+      :
+      [alpha] "r" (&alpha),
+      [C1] "r" (CO1 + 1 * ldc),
+      [C2] "r" (CO1 + 2 * ldc),
+      [C3] "r" (CO1 + 3 * ldc),
+      [C4] "r" (CO1 + 4 * ldc),
+      [C5] "r" (CO1 + 5 * ldc),
+      [C6] "r" (CO1 + 6 * ldc),
+      [C7] "r" (CO1 + 7 * ldc)
+
+      : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
+      "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28"
+    );
+    CO1 += 24;
+    AO += 16 * K;
+    i-= 24;
+  }
+
+
+  while (i >= 16) {
     double *BO;
+    double *A1;
     int kloop = K;
 
     BO = B + 12;
+    A1 = AO + 8 * K;
     /*
      * This is the inner loop for the hot hot path
      * Written in inline asm because compilers like GCC 8 and earlier
      * struggle with register allocation and are not good at using
      * the AVX512 built in broadcast ability (1to8)
      */
     asm(
+      "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
+      "vmovapd %%zmm1, %%zmm2\n"
+      "vmovapd %%zmm1, %%zmm3\n"
+      "vmovapd %%zmm1, %%zmm4\n"
+      "vmovapd %%zmm1, %%zmm5\n"
+      "vmovapd %%zmm1, %%zmm6\n"
+      "vmovapd %%zmm1, %%zmm7\n"
+      "vmovapd %%zmm1, %%zmm8\n"
+      "vmovapd %%zmm1, %%zmm11\n"
+      "vmovapd %%zmm1, %%zmm12\n"
+      "vmovapd %%zmm1, %%zmm13\n"
+      "vmovapd %%zmm1, %%zmm14\n"
+      "vmovapd %%zmm1, %%zmm15\n"
+      "vmovapd %%zmm1, %%zmm16\n"
+      "vmovapd %%zmm1, %%zmm17\n"
+      "vmovapd %%zmm1, %%zmm18\n"
+      "jmp .label16\n"
+      ".align 32\n"
+      /* Inner math loop */
+      ".label16:\n"
+      "vmovupd -128(%[AO]),%%zmm0\n"
+      "vmovupd -128(%[A1]),%%zmm10\n"
+
+      "vbroadcastsd -96(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n"
+
+      "vbroadcastsd -88(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n"
+
+      "vbroadcastsd -80(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n"
+
+      "vbroadcastsd -72(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n"
+
+      "vbroadcastsd -64(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n"
+
+      "vbroadcastsd -56(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n"
+
+      "vbroadcastsd -48(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n"
+
+      "vbroadcastsd -40(%[BO]), %%zmm9\n"
+      "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n"
+      "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n"
+      "add $64, %[AO]\n"
+      "add $64, %[A1]\n"
+      "add $64, %[BO]\n"
+      "prefetch 512(%[AO])\n"
+      "prefetch 512(%[A1])\n"
+      "prefetch 512(%[BO])\n"
+      "subl $1, %[kloop]\n"
+      "jg .label16\n"
+      /* multiply the result by alpha */
+      "vbroadcastsd (%[alpha]), %%zmm9\n"
+      "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
+      "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
+      "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
+      "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
+      "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
+      "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
+      "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
+      "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
+      "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
+      "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
+      "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
+      "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
+      "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
+      "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
+      "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
+      "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
+      /* And store additively in C */
+      "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
+      "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
+      "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
+      "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
+      "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
+      "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
+      "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
+      "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+      "vmovupd %%zmm1, (%[C0])\n"
+      "vmovupd %%zmm2, (%[C1])\n"
+      "vmovupd %%zmm3, (%[C2])\n"
+      "vmovupd %%zmm4, (%[C3])\n"
+      "vmovupd %%zmm5, (%[C4])\n"
+      "vmovupd %%zmm6, (%[C5])\n"
+      "vmovupd %%zmm7, (%[C6])\n"
+      "vmovupd %%zmm8, (%[C7])\n"
+
+      "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
+      "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
+      "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
+      "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
+      "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
+      "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
+      "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
+      "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+      "vmovupd %%zmm11, 64(%[C0])\n"
+      "vmovupd %%zmm12, 64(%[C1])\n"
+      "vmovupd %%zmm13, 64(%[C2])\n"
+      "vmovupd %%zmm14, 64(%[C3])\n"
+      "vmovupd %%zmm15, 64(%[C4])\n"
+      "vmovupd %%zmm16, 64(%[C5])\n"
+      "vmovupd %%zmm17, 64(%[C6])\n"
+      "vmovupd %%zmm18, 64(%[C7])\n"
+
+      :
+      [AO] "+r" (AO),
+      [A1] "+r" (A1),
+      [BO] "+r" (BO),
+      [C0] "+r" (CO1),
+      [kloop] "+r" (kloop)
+      :
+      [alpha] "r" (&alpha),
+      [C1] "r" (CO1 + 1 * ldc),
+      [C2] "r" (CO1 + 2 * ldc),
+      [C3] "r" (CO1 + 3 * ldc),
+      [C4] "r" (CO1 + 4 * ldc),
+      [C5] "r" (CO1 + 5 * ldc),
+      [C6] "r" (CO1 + 6 * ldc),
+      [C7] "r" (CO1 + 7 * ldc)
+
+      : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
+      "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18"
+    );
+    CO1 += 16;
+    AO += 8 * K;
+    i-= 16;
+  }
+
+  while (i >= 8) {
+    double *BO;
+    int kloop = K;
+
+    BO = B + 12;
+    /*
+     * This is the inner loop for the hot hot path
+     * Written in inline asm because compilers like GCC 8 and earlier
+     * struggle with register allocation and are not good at using
+     * the AVX512 built in broadcast ability (1to8)
+     */
+    asm(
       "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
       "vmovapd %%zmm1, %%zmm2\n"
       "vmovapd %%zmm1, %%zmm3\n"
