Skip to content

Commit 10168ab

Browse files
author
Iwan Kawrakow
committed
q8_KV: slightly faster gemv on Zen4
1 parent 1ecea16 commit 10168ab

File tree

1 file changed: 7 additions and 7 deletions.

ggml/src/iqk/iqk_mul_mat.cpp

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6246,7 +6246,7 @@ static void mul_mat_q8_KV_q8_KV_1(int n, const void * vx, size_t bx, const DataI
62466246
GGML_ASSERT(nrc_x%8 == 0);
62476247
GGML_ASSERT(n%32 == 0);
62486248
__m256i qx[2];
6249-
__m256i acc[nrc_y] = {};
6249+
__m256i acc[2*nrc_y] = {};
62506250
float dy[nrc_y];
62516251
#ifdef HAVE_FANCY_SIMD
62526252
int32_t sy[nrc_y];
@@ -6279,10 +6279,10 @@ static void mul_mat_q8_KV_q8_KV_1(int n, const void * vx, size_t bx, const DataI
62796279
for (int iy = 0; iy < nrc_y; ++iy) {
62806280
for (int j = 0; j < 2; ++j) {
62816281
#ifdef HAVE_FANCY_SIMD
6282-
acc[iy] = _mm256_dpbusd_epi32(acc[iy], qx[j], _mm256_loadu_si256((const __m256i *)q8y[iy] + 2*i + j));
6282+
acc[2*iy+j] = _mm256_dpbusd_epi32(acc[2*iy+j], qx[j], _mm256_loadu_si256((const __m256i *)q8y[iy] + 2*i + j));
62836283
#else
62846284
auto dot = _mm256_maddubs_epi16(sx[j], _mm256_sign_epi8(_mm256_loadu_si256((const __m256i *)q8y[iy] + 2*i + j), qx[j]));
6285-
acc[iy] = _mm256_add_epi32(acc[iy], _mm256_madd_epi16(m1, dot));
6285+
acc[2*iy+j] = _mm256_add_epi32(acc[2*iy+j], _mm256_madd_epi16(m1, dot));
62866286
#endif
62876287
}
62886288
}
@@ -6296,21 +6296,21 @@ static void mul_mat_q8_KV_q8_KV_1(int n, const void * vx, size_t bx, const DataI
62966296
#endif
62976297
for (int iy = 0; iy < nrc_y; ++iy) {
62986298
#ifdef HAVE_FANCY_SIMD
6299-
acc[iy] = _mm256_dpbusd_epi32(acc[iy], qx[0], _mm256_loadu_si256((const __m256i *)q8y[iy] + i));
6299+
acc[2*iy] = _mm256_dpbusd_epi32(acc[2*iy], qx[0], _mm256_loadu_si256((const __m256i *)q8y[iy] + i));
63006300
#else
63016301
auto dot = _mm256_maddubs_epi16(sx[0], _mm256_sign_epi8(_mm256_loadu_si256((const __m256i *)q8y[iy] + i), qx[0]));
6302-
acc[iy] = _mm256_add_epi32(acc[iy], _mm256_madd_epi16(m1, dot));
6302+
acc[2*iy] = _mm256_add_epi32(acc[2*iy], _mm256_madd_epi16(m1, dot));
63036303
#endif
63046304
}
63056305
}
63066306
for (int iy = 0; iy < nrc_y; ++iy) {
6307-
auto sumi = hsum_i32_8(acc[iy]);
6307+
auto sumi = hsum_i32_8(_mm256_add_epi32(acc[2*iy], acc[2*iy+1]));
63086308
#ifdef HAVE_FANCY_SIMD
63096309
info.store(ix, iy, dx[0]*dy[iy]*(sumi+sy[iy]));
63106310
#else
63116311
info.store(ix, iy, dx[0]*dy[iy]*sumi);
63126312
#endif
6313-
acc[iy] = _mm256_setzero_si256();
6313+
acc[2*iy] = acc[2*iy+1] = _mm256_setzero_si256();
63146314
}
63156315
}
63166316
}

0 commit comments

Comments
 (0)