@@ -849,18 +849,372 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,

i = m;
- while (i >= 8) {
+ while (i >= 24) {
+ double *BO;
+ double *A1, *A2;
+ int kloop = K;
+
+ BO = B + 12;
+ A1 = AO + 8 * K;
+ A2 = AO + 16 * K;
+ /*
+ * This is the inner loop for the hot hot path
+ * Written in inline asm because compilers like GCC 8 and earlier
+ * struggle with register allocation and are not good at using
+ * the AVX512 built in broadcast ability (1to8)
+ */
+ asm(
+ "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
+ "vmovapd %%zmm1, %%zmm2\n"
+ "vmovapd %%zmm1, %%zmm3\n"
+ "vmovapd %%zmm1, %%zmm4\n"
+ "vmovapd %%zmm1, %%zmm5\n"
+ "vmovapd %%zmm1, %%zmm6\n"
+ "vmovapd %%zmm1, %%zmm7\n"
+ "vmovapd %%zmm1, %%zmm8\n"
+ "vmovapd %%zmm1, %%zmm11\n"
+ "vmovapd %%zmm1, %%zmm12\n"
+ "vmovapd %%zmm1, %%zmm13\n"
+ "vmovapd %%zmm1, %%zmm14\n"
+ "vmovapd %%zmm1, %%zmm15\n"
+ "vmovapd %%zmm1, %%zmm16\n"
+ "vmovapd %%zmm1, %%zmm17\n"
+ "vmovapd %%zmm1, %%zmm18\n"
+ "vmovapd %%zmm1, %%zmm21\n"
+ "vmovapd %%zmm1, %%zmm22\n"
+ "vmovapd %%zmm1, %%zmm23\n"
+ "vmovapd %%zmm1, %%zmm24\n"
+ "vmovapd %%zmm1, %%zmm25\n"
+ "vmovapd %%zmm1, %%zmm26\n"
+ "vmovapd %%zmm1, %%zmm27\n"
+ "vmovapd %%zmm1, %%zmm28\n"
+ "jmp .label24\n"
+ ".align 32\n"
+ /* Inner math loop */
+ ".label24:\n"
+ "vmovupd -128(%[AO]),%%zmm0\n"
+ "vmovupd -128(%[A1]),%%zmm10\n"
+ "vmovupd -128(%[A2]),%%zmm20\n"
+
+ "vbroadcastsd -96(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm21\n"
+
+ "vbroadcastsd -88(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm22\n"
+
+ "vbroadcastsd -80(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm23\n"
+
+ "vbroadcastsd -72(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm24\n"
+
+ "vbroadcastsd -64(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm25\n"
+
+ "vbroadcastsd -56(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm26\n"
+
+ "vbroadcastsd -48(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm27\n"
+
+ "vbroadcastsd -40(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n"
+ "vfmadd231pd %%zmm9, %%zmm20, %%zmm28\n"
+ "add $64, %[AO]\n"
+ "add $64, %[A1]\n"
+ "add $64, %[A2]\n"
+ "add $64, %[BO]\n"
+ "prefetch 512(%[AO])\n"
+ "prefetch 512(%[A1])\n"
+ "prefetch 512(%[A2])\n"
+ "prefetch 512(%[BO])\n"
+ "subl $1, %[kloop]\n"
+ "jg .label24\n"
+ /* multiply the result by alpha */
+ "vbroadcastsd (%[alpha]), %%zmm9\n"
+ "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
+ "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
+ "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
+ "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
+ "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
+ "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
+ "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
+ "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
+ "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
+ "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
+ "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
+ "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
+ "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
+ "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
+ "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
+ "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
+ "vmulpd %%zmm9, %%zmm21, %%zmm21\n"
+ "vmulpd %%zmm9, %%zmm22, %%zmm22\n"
+ "vmulpd %%zmm9, %%zmm23, %%zmm23\n"
+ "vmulpd %%zmm9, %%zmm24, %%zmm24\n"
+ "vmulpd %%zmm9, %%zmm25, %%zmm25\n"
+ "vmulpd %%zmm9, %%zmm26, %%zmm26\n"
+ "vmulpd %%zmm9, %%zmm27, %%zmm27\n"
+ "vmulpd %%zmm9, %%zmm28, %%zmm28\n"
+ /* And store additively in C */
+ "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
+ "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
+ "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
+ "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
+ "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
+ "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
+ "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
+ "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ "vmovupd %%zmm1, (%[C0])\n"
+ "vmovupd %%zmm2, (%[C1])\n"
+ "vmovupd %%zmm3, (%[C2])\n"
+ "vmovupd %%zmm4, (%[C3])\n"
+ "vmovupd %%zmm5, (%[C4])\n"
+ "vmovupd %%zmm6, (%[C5])\n"
+ "vmovupd %%zmm7, (%[C6])\n"
+ "vmovupd %%zmm8, (%[C7])\n"
+
+ "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
+ "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
+ "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
+ "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
+ "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
+ "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
+ "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
+ "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+ "vmovupd %%zmm11, 64(%[C0])\n"
+ "vmovupd %%zmm12, 64(%[C1])\n"
+ "vmovupd %%zmm13, 64(%[C2])\n"
+ "vmovupd %%zmm14, 64(%[C3])\n"
+ "vmovupd %%zmm15, 64(%[C4])\n"
+ "vmovupd %%zmm16, 64(%[C5])\n"
+ "vmovupd %%zmm17, 64(%[C6])\n"
+ "vmovupd %%zmm18, 64(%[C7])\n"
+
+ "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n"
+ "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n"
+ "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n"
+ "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n"
+ "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n"
+ "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n"
+ "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n"
+ "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n"
+ "vmovupd %%zmm21, 128(%[C0])\n"
+ "vmovupd %%zmm22, 128(%[C1])\n"
+ "vmovupd %%zmm23, 128(%[C2])\n"
+ "vmovupd %%zmm24, 128(%[C3])\n"
+ "vmovupd %%zmm25, 128(%[C4])\n"
+ "vmovupd %%zmm26, 128(%[C5])\n"
+ "vmovupd %%zmm27, 128(%[C6])\n"
+ "vmovupd %%zmm28, 128(%[C7])\n"
+
+ :
+ [AO] "+r" (AO),
+ [A1] "+r" (A1),
+ [A2] "+r" (A2),
+ [BO] "+r" (BO),
+ [C0] "+r" (CO1),
+ [kloop] "+r" (kloop)
+ :
+ [alpha] "r" (&alpha),
+ [C1] "r" (CO1 + 1 * ldc),
+ [C2] "r" (CO1 + 2 * ldc),
+ [C3] "r" (CO1 + 3 * ldc),
+ [C4] "r" (CO1 + 4 * ldc),
+ [C5] "r" (CO1 + 5 * ldc),
+ [C6] "r" (CO1 + 6 * ldc),
+ [C7] "r" (CO1 + 7 * ldc)
+
+ : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
+ "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18",
+ "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28"
+ );
+ CO1 += 24;
+ AO += 16 * K;
+ i -= 24;
+ }
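The comment in the added block refers to AVX-512's {1to8} embedded broadcast, which lets an FMA pull a single B value from memory and replicate it across all eight lanes. A minimal, compiler-facing sketch of one k iteration of the 24x8 loop, written with intrinsics (illustrative only, not part of this commit; the helper name and signature are invented here), might look like this:

#include <immintrin.h>

/* Illustrative sketch only -- not part of the commit.  One k iteration of the
 * 24x8 inner loop above, expressed with intrinsics.  acc[0][j], acc[1][j] and
 * acc[2][j] play the roles of zmm1..8, zmm11..18 and zmm21..28; a compiler
 * may fold the _mm512_set1_pd() into the FMA as a {1to8} embedded broadcast,
 * which is what the comment above alludes to. */
static inline void dgemm_24x8_kstep(const double *ao, const double *a1,
                                    const double *a2, const double *bo,
                                    __m512d acc[3][8])
{
    __m512d a[3] = { _mm512_loadu_pd(ao),    /* rows 0..7  of the A tile */
                     _mm512_loadu_pd(a1),    /* rows 8..15               */
                     _mm512_loadu_pd(a2) };  /* rows 16..23              */
    for (int j = 0; j < 8; j++) {
        __m512d b = _mm512_set1_pd(bo[j]);   /* one B value, broadcast   */
        for (int p = 0; p < 3; p++)
            acc[p][j] = _mm512_fmadd_pd(b, a[p], acc[p][j]);
    }
}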
+
+
+ while (i >= 16) {
double *BO;
+ double *A1;
int kloop = K;

BO = B + 12;
+ A1 = AO + 8 * K;
/*
* This is the inner loop for the hot hot path
* Written in inline asm because compilers like GCC 8 and earlier
* struggle with register allocation and are not good at using
* the AVX512 built in broadcast ability (1to8)
*/
asm(
+ "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
+ "vmovapd %%zmm1, %%zmm2\n"
+ "vmovapd %%zmm1, %%zmm3\n"
+ "vmovapd %%zmm1, %%zmm4\n"
+ "vmovapd %%zmm1, %%zmm5\n"
+ "vmovapd %%zmm1, %%zmm6\n"
+ "vmovapd %%zmm1, %%zmm7\n"
+ "vmovapd %%zmm1, %%zmm8\n"
+ "vmovapd %%zmm1, %%zmm11\n"
+ "vmovapd %%zmm1, %%zmm12\n"
+ "vmovapd %%zmm1, %%zmm13\n"
+ "vmovapd %%zmm1, %%zmm14\n"
+ "vmovapd %%zmm1, %%zmm15\n"
+ "vmovapd %%zmm1, %%zmm16\n"
+ "vmovapd %%zmm1, %%zmm17\n"
+ "vmovapd %%zmm1, %%zmm18\n"
+ "jmp .label16\n"
+ ".align 32\n"
+ /* Inner math loop */
+ ".label16:\n"
+ "vmovupd -128(%[AO]),%%zmm0\n"
+ "vmovupd -128(%[A1]),%%zmm10\n"
+
+ "vbroadcastsd -96(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n"
+
+ "vbroadcastsd -88(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n"
+
+ "vbroadcastsd -80(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n"
+
+ "vbroadcastsd -72(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n"
+
+ "vbroadcastsd -64(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n"
+
+ "vbroadcastsd -56(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n"
+
+ "vbroadcastsd -48(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n"
+
+ "vbroadcastsd -40(%[BO]), %%zmm9\n"
+ "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n"
+ "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n"
+ "add $64, %[AO]\n"
+ "add $64, %[A1]\n"
+ "add $64, %[BO]\n"
+ "prefetch 512(%[AO])\n"
+ "prefetch 512(%[A1])\n"
+ "prefetch 512(%[BO])\n"
+ "subl $1, %[kloop]\n"
+ "jg .label16\n"
+ /* multiply the result by alpha */
+ "vbroadcastsd (%[alpha]), %%zmm9\n"
+ "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
+ "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
+ "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
+ "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
+ "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
+ "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
+ "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
+ "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
+ "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
+ "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
+ "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
+ "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
+ "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
+ "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
+ "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
+ "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
+ /* And store additively in C */
+ "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
+ "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
+ "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
+ "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
+ "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
+ "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
+ "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
+ "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ "vmovupd %%zmm1, (%[C0])\n"
+ "vmovupd %%zmm2, (%[C1])\n"
+ "vmovupd %%zmm3, (%[C2])\n"
+ "vmovupd %%zmm4, (%[C3])\n"
+ "vmovupd %%zmm5, (%[C4])\n"
+ "vmovupd %%zmm6, (%[C5])\n"
+ "vmovupd %%zmm7, (%[C6])\n"
+ "vmovupd %%zmm8, (%[C7])\n"
+
+ "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
+ "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
+ "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
+ "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
+ "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
+ "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
+ "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
+ "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+ "vmovupd %%zmm11, 64(%[C0])\n"
+ "vmovupd %%zmm12, 64(%[C1])\n"
+ "vmovupd %%zmm13, 64(%[C2])\n"
+ "vmovupd %%zmm14, 64(%[C3])\n"
+ "vmovupd %%zmm15, 64(%[C4])\n"
+ "vmovupd %%zmm16, 64(%[C5])\n"
+ "vmovupd %%zmm17, 64(%[C6])\n"
+ "vmovupd %%zmm18, 64(%[C7])\n"
+
+ :
+ [AO] "+r" (AO),
+ [A1] "+r" (A1),
+ [BO] "+r" (BO),
+ [C0] "+r" (CO1),
+ [kloop] "+r" (kloop)
+ :
+ [alpha] "r" (&alpha),
+ [C1] "r" (CO1 + 1 * ldc),
+ [C2] "r" (CO1 + 2 * ldc),
+ [C3] "r" (CO1 + 3 * ldc),
+ [C4] "r" (CO1 + 4 * ldc),
+ [C5] "r" (CO1 + 5 * ldc),
+ [C6] "r" (CO1 + 6 * ldc),
+ [C7] "r" (CO1 + 7 * ldc)
+
+ : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
+ "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18"
+ );
+ CO1 += 16;
+ AO += 8 * K;
+ i -= 16;
+ }
+
+ while (i >= 8) {
+ double *BO;
+ int kloop = K;
+
+ BO = B + 12;
+ /*
+ * This is the inner loop for the hot hot path
+ * Written in inline asm because compilers like GCC 8 and earlier
+ * struggle with register allocation and are not good at using
+ * the AVX512 built in broadcast ability (1to8)
+ */
+ asm(
"vxorpd %%zmm1, %%zmm1, %%zmm1\n"
"vmovapd %%zmm1, %%zmm2\n"
"vmovapd %%zmm1, %%zmm3\n"
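Taken together, the new code walks the M dimension in 24-, 16- and 8-row passes, each holding an entire row block of C in zmm accumulators for all eight columns. A plain scalar model of what one 24x8 tile computes, assuming the packed operand layout implied by the asm (A packed as three consecutive 8-row panels of length K, B packed as 8 doubles per k step, C column-major with leading dimension ldc; the function name is invented for illustration):

/* Scalar reference model, illustrative only -- not part of the commit. */
static void ref_dgemm_tile_24x8(int K, double alpha, const double *AO,
                                const double *BO, double *C, int ldc)
{
    for (int j = 0; j < 8; j++)              /* column of B and C        */
        for (int p = 0; p < 3; p++)          /* which 8-row panel of A   */
            for (int r = 0; r < 8; r++) {    /* row within the panel     */
                double acc = 0.0;
                for (int k = 0; k < K; k++)
                    acc += AO[p * 8 * K + 8 * k + r] * BO[8 * k + j];
                C[j * ldc + p * 8 + r] += alpha * acc;
            }
}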