@@ -1272,8 +1272,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
         int tmp, tmp2;
         float ftmp, ft2;
+        const uint8_t * restrict q40;
+        const uint8_t * restrict q41;
+        const uint8_t * restrict q42;
+        const uint8_t * restrict q43;
+        const int8_t  * restrict q80;
+        const int8_t  * restrict q81;
+        const int8_t  * restrict q82;
+        const int8_t  * restrict q83;
+        int s0, s1, s2, s3;
 
         __asm__ __volatile__(
+            "li %[s1], 8\n\t"
             "vsetivli zero, 4, e32, m1, ta, ma\n\t"
             "vle32.v v1, (%[s6b])\n\t"
             "vslide1down.vx v1, v1, zero\n\t"
@@ -1287,14 +1297,13 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             "vslide1up.vx v5, v4, zero\n\t" // {0, 4}
             "vsrl.vi v6, v1, 6\n\t"
             "vsrl.vv v7, v2, v5\n\t"
+            "vsse32.v v8, (%[utmp]), %[s1]\n\t"
             "vand.vx v0, v6, %[kmask3]\n\t"
             "vand.vx v2, v7, %[kmask2]\n\t"
             "vsll.vi v6, v0, 4\n\t"
-            "li %[t2], 8\n\t"
-            "addi %[t1], %[utmp], 4\n\t"
+            "addi %[s0], %[utmp], 4\n\t"
             "vor.vv v1, v6, v2\n\t"
-            "vsse32.v v8, (%[utmp]), %[t2]\n\t"
-            "vsse32.v v1, (%[t1]), %[t2]\n\t"
+            "vsse32.v v1, (%[s0]), %[s1]\n\t"
             "vsetivli zero, 8, e16, m1, ta, ma\n\t"
             "vle32.v v2, (%[bsums])\n\t"
             "vnsrl.wi v0, v2, 0\n\t"
@@ -1307,107 +1316,84 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             "vredsum.vs v0, v6, v16\n\t"
             "vredsum.vs v0, v7, v0\n\t"
             "vfcvt.f.x.v v0, v0\n\t"
-            "vfmv.f.s %[ftmp], v0"
-            : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [ftmp] "=&f" (ftmp)
-            : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
-            , [s6b] "r" (&x[i]), [kmask1] "r" (kmask1)
-            , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
-            : "memory"
-            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
-            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
-        );
-        sumf -= dmin * ftmp;
-
-        const uint8_t * restrict q40 = x[i].qs + 0;
-        const uint8_t * restrict q41 = x[i].qs + 16;
-        const uint8_t * restrict q42 = x[i].qs + 32;
-        const uint8_t * restrict q43 = x[i].qs + 48;
-        const int8_t  * restrict q80;
-        const int8_t  * restrict q81;
-        const int8_t  * restrict q82;
-        const int8_t  * restrict q83;
-
-        ftmp = 0;
-        const uint8_t * scale = scales;
-
-        int s0, s1, s2, s3;
-        __asm__ __volatile__(
+            "vfmv.f.s %[ftmp], v0\n\t"
             "vsetivli zero, 16, e8, m1, ta, ma\n\t"
-            "vle8.v v0, (%[q40])\n\t"
-            "addi %[q80], %[ys], 0\n\t"
-            "addi %[q40], %[q40], 64\n\t"
+            "vle8.v v0, (%[xs])\n\t"
+            "fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
+            "addi %[q40], %[xs], 64\n\t"
+            "addi %[q41], %[xs], 16\n\t"
+            "addi %[q42], %[xs], 32\n\t"
+            "addi %[q43], %[xs], 48\n\t"
+            "addi %[q80], %[ys], 64\n\t"
             "vle8.v v1, (%[q41])\n\t"
+            "vle8.v v2, (%[q42])\n\t"
             "addi %[q81], %[ys], 16\n\t"
             "addi %[q41], %[q41], 64\n\t"
-            "vle8.v v2, (%[q42])\n\t"
             "addi %[q82], %[ys], 32\n\t"
-            "addi %[q42], %[q42], 64\n\t"
             "vle8.v v3, (%[q43])\n\t"
+            "vle8.v v8, (%[ys])\n\t"
+            "addi %[q42], %[q42], 64\n\t"
             "addi %[q83], %[ys], 48\n\t"
             "addi %[q43], %[q43], 64\n\t"
-            "vle8.v v8, (%[q80])\n\t"
             "vsrl.vi v4, v0, 4\n\t"
-            "addi %[q80], %[q80], 64\n\t"
             "vle8.v v9, (%[q81])\n\t"
+            "vle8.v v10, (%[q82])\n\t"
             "vand.vi v0, v0, 0xF\n\t"
             "addi %[q81], %[q81], 64\n\t"
-            "vle8.v v10, (%[q82])\n\t"
             "vsrl.vi v5, v1, 4\n\t"
             "addi %[q82], %[q82], 64\n\t"
             "vle8.v v11, (%[q83])\n\t"
+            "vle8.v v12, (%[q80])\n\t"
             "vand.vi v1, v1, 0xF\n\t"
             "addi %[q83], %[q83], 64\n\t"
-            "vle8.v v12, (%[q80])\n\t"
             "vsrl.vi v6, v2, 4\n\t"
             "addi %[q80], %[q80], 64\n\t"
             "vle8.v v13, (%[q81])\n\t"
+            "vle8.v v14, (%[q82])\n\t"
             "vand.vi v2, v2, 0xF\n\t"
             "addi %[q81], %[q81], 64\n\t"
-            "vle8.v v14, (%[q82])\n\t"
             "vsrl.vi v7, v3, 4\n\t"
             "addi %[q82], %[q82], 64\n\t"
+            "vwmul.vv v16, v0, v8\n\t"
             "vle8.v v15, (%[q83])\n\t"
+            "vle8.v v0, (%[q40])\n\t"
             "vand.vi v3, v3, 0xF\n\t"
             "addi %[q83], %[q83], 64\n\t"
-            "vwmul.vv v16, v0, v8\n\t"
             "vwmul.vv v24, v2, v12\n\t"
             "vwmul.vv v20, v4, v10\n\t"
             "vwmul.vv v28, v6, v14\n\t"
             "vwmacc.vv v16, v1, v9\n\t"
+            "vle8.v v1, (%[q41])\n\t"
+            "vle8.v v2, (%[q42])\n\t"
             "vwmacc.vv v24, v3, v13\n\t"
             "vwmacc.vv v20, v5, v11\n\t"
             "vwmacc.vv v28, v7, v15\n\t"
-            "vle8.v v0, (%[q40])\n\t"
             "addi %[q40], %[q80], 64\n\t"
-            "vle8.v v1, (%[q41])\n\t"
             "addi %[q41], %[q81], 64\n\t"
-            "vle8.v v2, (%[q42])\n\t"
-            "addi %[q42], %[q82], 64\n\t"
             "vle8.v v3, (%[q43])\n\t"
-            "addi %[q43], %[q83], 64\n\t"
             "vle8.v v8, (%[q80])\n\t"
+            "addi %[q42], %[q82], 64\n\t"
+            "addi %[q43], %[q83], 64\n\t"
             "vsrl.vi v4, v0, 4\n\t"
             "vle8.v v9, (%[q81])\n\t"
-            "vand.vi v0, v0, 0xF\n\t"
             "vle8.v v10, (%[q82])\n\t"
+            "vand.vi v0, v0, 0xF\n\t"
             "vsrl.vi v5, v1, 4\n\t"
+            "vsrl.vi v7, v3, 4\n\t"
+            "vand.vi v3, v3, 0xF\n\t"
             "vle8.v v11, (%[q83])\n\t"
-            "vand.vi v1, v1, 0xF\n\t"
             "vle8.v v12, (%[q40])\n\t"
+            "vand.vi v1, v1, 0xF\n\t"
             "vsrl.vi v6, v2, 4\n\t"
-            "vle8.v v13, (%[q41])\n\t"
             "vand.vi v2, v2, 0xF\n\t"
-            "vle8.v v14, (%[q42])\n\t"
-            "vsrl.vi v7, v3, 4\n\t"
-            "vle8.v v15, (%[q43])\n\t"
-            "vand.vi v3, v3, 0xF\n\t"
             "vwmul.vv v18, v0, v8\n\t"
+            "vle8.v v13, (%[q41])\n\t"
+            "vle8.v v14, (%[q42])\n\t"
             "vwmul.vv v26, v2, v12\n\t"
             "vwmul.vv v22, v4, v10\n\t"
             "vwmul.vv v30, v6, v14\n\t"
             "vwmacc.vv v18, v1, v9\n\t"
+            "vle8.v v15, (%[q43])\n\t"
             "vwmacc.vv v26, v3, v13\n\t"
             "vwmacc.vv v22, v5, v11\n\t"
             "vwmacc.vv v30, v7, v15\n\t"
@@ -1444,12 +1430,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             "vfmv.f.s %[ftmp], v1\n\t"
             "fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
             "fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
-            : [tmp] "=&r" (tmp), [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
+            : [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
             , [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
-            , [q40] "+&r" (q40), [q41] "+&r" (q41), [q42] "+&r" (q42), [q43] "+&r" (q43)
+            , [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
             , [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
-            , [scale] "+&r" (scale)
-            : [d] "f" (d), [ys] "r" (y[i].qs)
+            : [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
+            , [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
+            , [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
+            , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
             : "memory"
             , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
             , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
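
For reference, the arithmetic this kernel vectorizes is the standard q4_K x q8_K super-block dot product: eight 32-element integer dot products weighted by the 6-bit sub-block scales, minus the 6-bit mins applied to the q8 block sums. The sketch below is a rough scalar model under the generic q4_K layout assumptions; the function name and the pre-unpacked sc/mn arrays are illustrative only (the real code derives them from the packed block_q4_K via utmp/scales/mins), and it describes the math, not this asm's instruction scheduling.

#include <stdint.h>

// Hypothetical scalar model of one q4_K x q8_K super-block (256 weights,
// 8 sub-blocks of 32). d and dmin are the combined factors y->d * x->d
// and y->d * x->dmin; scales and mins are assumed already unpacked.
static float q4k_q8k_superblock_ref(float d, float dmin,
                                    const uint8_t sc[8],     // 6-bit sub-block scales
                                    const uint8_t mn[8],     // 6-bit sub-block mins
                                    const uint8_t q4[128],   // two 4-bit weights per byte
                                    const int8_t  q8[256],   // int8 activations
                                    const int16_t bsums[16]) // sums of 16 consecutive q8
{
    int32_t acc = 0;
    for (int j = 0; j < 8; ++j) {
        int32_t dot = 0;
        for (int k = 0; k < 32; ++k) {
            // even sub-blocks sit in the low nibbles, odd ones in the high nibbles
            const int     b = (j / 2) * 32 + k;
            const int32_t q = (j & 1) ? (q4[b] >> 4) : (q4[b] & 0xF);
            dot += q * q8[j * 32 + k];
        }
        acc += sc[j] * dot;            // per-sub-block integer dot, scaled
    }
    int32_t mins_dot = 0;
    for (int j = 0; j < 8; ++j) {
        mins_dot += mn[j] * (bsums[2*j] + bsums[2*j + 1]);
    }
    // mirrors the fnmsub/fmadd pair in the asm: sumf -= dmin*mins_dot; sumf += d*acc
    return d * (float)acc - dmin * (float)mins_dot;
}

As the diff shows, the change appears to fold the separate mins pass and quant pass into a single asm block, so the pointer setup for q40..q43 and q80..q83 and the fnmsub of the mins term can overlap with the first vector loads instead of running between two asm statements.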