@@ -1270,7 +1270,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
12701270 const float d = y [i ].d * GGML_CPU_FP16_TO_FP32 (x [i ].d );
12711271 const float dmin = y [i ].d * GGML_CPU_FP16_TO_FP32 (x [i ].dmin );
12721272
1273- int tmp , tmp2 ;
12741273 float ftmp , ft2 ;
12751274 const uint8_t * restrict q40 ;
12761275 const uint8_t * restrict q41 ;
@@ -1778,23 +1777,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
17781777
17791778 const int8_t * restrict scale = x [i ].scales ;
17801779
1781- int sum_t = 0 ;
1782- int t0 ;
1780+ int q6h ;
1781+ float ftmp ;
17831782
17841783 for (int j = 0 ; j < QK_K /128 ; ++ j ) {
17851784 __asm__ __volatile__(
1785+ "addi %[q6h], %[q6], 32\n\t"
1786+ "ld t0, 0(%[scale])\n\t"
1787+ "addi %[scale], %[scale], 8\n\t"
1788+ "slli t6, t0, 1 * 8\n\t"
1789+ "lb zero, 0(%[q6])\n\t"
1790+ "slli t5, t0, 2 * 8\n\t"
1791+ "slli t4, t0, 3 * 8\n\t"
1792+ "lb zero, 0(%[q6h])\n\t"
1793+ "slli t3, t0, 4 * 8\n\t"
1794+ "slli t2, t0, 5 * 8\n\t"
1795+ "lb zero, 0(%[qh])\n\t"
1796+ "lb zero, 31(%[q6h])\n\t"
1797+ "slli t1, t0, 6 * 8\n\t"
1798+ "srai a7, t0, 56\n\t"
17861799 "vsetvli zero, %[vl32], e8, m2\n\t"
1800+ "vle8.v v8, (%[q6])\n\t"
1801+ "srai t6, t6, 56\n\t"
1802+ "srai t5, t5, 56\n\t"
1803+ "srai t4, t4, 56\n\t"
1804+ "srai t3, t3, 56\n\t"
1805+ "vle8.v v10, (%[q6h])\n\t"
1806+ "addi %[q6], %[q6], 64\n\t"
1807+ "slli t0, t0, 7 * 8\n\t"
1808+ "srai t2, t2, 56\n\t"
1809+ "srai t1, t1, 56\n\t"
1810+ "srai t0, t0, 56\n\t"
17871811 "vle8.v v4, (%[qh])\n\t"
1812+ "vsrl.vi v12, v8, 4\n\t"
1813+ "vsrl.vi v14, v10, 4\n\t"
1814+ "lb zero, 0(%[q8])\n\t"
1815+ "vand.vi v8, v8, 0xF\n\t"
1816+ "vand.vi v10, v10, 0xF\n\t"
1817+ "lb zero, 32(%[q8])\n\t"
17881818 "vsll.vi v0, v4, 4\n\t"
17891819 "vsll.vi v2, v4, 2\n\t"
1820+ "lb zero, 64(%[q8])\n\t"
17901821 "vsrl.vi v6, v4, 2\n\t"
1791- "vsetvli zero, %[vl64], e8, m4\n\t"
1792- "vle8.v v8, (%[q6])\n\t"
1793- "vsrl.vi v12, v8, 4\n\t"
1794- "vand.vi v8, v8, 0xF\n\t"
1795- "vsetvli zero, %[vl128], e8, m8\n\t"
17961822 "vand.vx v0, v0, %[mask]\n\t"
1823+ "lb zero, 96(%[q8])\n\t"
1824+ "vand.vx v2, v2, %[mask]\n\t"
1825+ "vand.vx v4, v4, %[mask]\n\t"
1826+ "vand.vx v6, v6, %[mask]\n\t"
17971827 "vor.vv v8, v8, v0\n\t"
1828+ "lb zero, 127(%[q8])\n\t"
1829+ "vor.vv v10, v10, v2\n\t"
1830+ "vor.vv v12, v12, v4\n\t"
1831+ "vor.vv v14, v14, v6\n\t"
1832+ "vsetvli zero, %[vl128], e8, m8\n\t"
17981833 "vle8.v v0, (%[q8])\n\t"
17991834 "vsub.vx v8, v8, %[vl32]\n\t"
18001835 "vsetvli zero, %[vl64], e8, m4\n\t"
@@ -1811,34 +1846,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
18111846 "vwredsum.vs v13, v28, v0\n\t"
18121847 "vwredsum.vs v14, v30, v0\n\t"
18131848 "vsetivli zero, 4, e32, m1\n\t"
1814- "vslideup.vi v10, v9, 1\n\t"
1815- "vslideup.vi v8, v7, 1\n\t"
1816- "vslideup.vi v11, v12, 1\n\t"
1817- "vslideup.vi v13, v14, 1\n\t"
1818- "vslideup.vi v10, v8, 2\n\t"
1819- "vslideup.vi v11, v13, 2\n\t"
1820- "vsetivli zero, 8, e32, m2\n\t"
1821- "vle8.v v2, (%[scale])\n\t"
1822- "vsext.vf4 v4, v2\n\t"
1823- "vmul.vv v2, v4, v10\n\t"
1824- "vredsum.vs v0, v2, v0\n\t"
1825- "vmv.x.s %[t0], v0\n\t"
1826- "add %[sumi], %[sumi], %[t0]"
1827- : [sumi ] "+&r" (sum_t ), [t0 ] "=&r" (t0 )
1828- : [qh ] "r" (qh ), [q6 ] "r" (q6 ), [q8 ] "r" (q8 ), [scale ] "r" (scale )
1849+ "vmul.vx v0, v10, t0\n\t"
1850+ "vmul.vx v1, v9, t1\n\t"
1851+ "vmacc.vx v0, t2, v8\n\t"
1852+ "vmacc.vx v1, t3, v7\n\t"
1853+ "vmacc.vx v0, t4, v11\n\t"
1854+ "vmacc.vx v1, t5, v12\n\t"
1855+ "vmacc.vx v0, t6, v13\n\t"
1856+ "vmacc.vx v1, a7, v14\n\t"
1857+ "vadd.vv v0, v0, v1\n\t"
1858+ "vfcvt.f.x.v v0, v0\n\t"
1859+ "vfmv.f.s %[ftmp], v0\n\t"
1860+ "fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
1861+ : [q6 ] "+&r" (q6 ), [q6h ] "=&r" (q6h )
1862+ , [scale ] "+&r" (scale )
1863+ , [sumf ] "+&f" (sumf ), [ftmp ] "=&f" (ftmp )
1864+ : [qh ] "r" (qh ), [q8 ] "r" (q8 )
18291865 , [vl32 ] "r" (32 ), [vl64 ] "r" (64 ), [vl128 ] "r" (128 )
1830- , [mask ] "r" (0x30 )
1866+ , [mask ] "r" (0x30 ), [ d ] "f" ( d )
18311867 : "memory"
18321868 , "v0" , "v1" , "v2" , "v3" , "v4" , "v5" , "v6" , "v7"
18331869 , "v8" , "v9" , "v10" , "v11" , "v12" , "v13" , "v14" , "v15"
18341870 , "v16" , "v17" , "v18" , "v19" , "v20" , "v21" , "v22" , "v23"
18351871 , "v24" , "v25" , "v26" , "v27" , "v28" , "v29" , "v30" , "v31"
1872+ , "t0" , "t1" , "t2" , "t3" , "t4" , "t5" , "t6" , "a7"
1873+ , "a6" , "a5" , "a4" , "a3"
18361874 );
1837- q6 += 64 ; qh += 32 ; q8 += 128 ; scale += 8 ;
1875+ qh += 32 ; q8 += 128 ;
18381876 }
1839-
1840- sumf += d * sum_t ;
1841-
18421877 }
18431878 break ;
18441879 default :
0 commit comments