ggml/src/ggml.c: 4 additions & 0 deletions
@@ -1153,7 +1153,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_iq3_s,
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
#ifdef __AVX2__
.vec_dot_type = GGML_TYPE_Q8_2_X4,
#else
.vec_dot_type = GGML_TYPE_Q8_K,
#endif
.nrows = 1,
.row_meta_size = 0,
},
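This change selects the IQ3_S activation (vec_dot) type at compile time: AVX2 builds now quantize the right-hand side to Q8_2_X4 for the new kernels added below, while other builds keep the existing Q8_K path. A minimal sketch of the selection, assuming only the two type constants from the diff (the helper function itself is hypothetical; the PR expresses this directly in the type_traits initializer):

#include "ggml.h"

// Hypothetical helper mirroring the compile-time choice in type_traits.
static enum ggml_type iq3_s_vec_dot_type(void) {
#ifdef __AVX2__
    return GGML_TYPE_Q8_2_X4;  // activation type for the new AVX2 IQ3_S kernels
#else
    return GGML_TYPE_Q8_K;     // previous default, kept for non-AVX2 builds
#endif
}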
ggml/src/iqk/iqk_gemm_iquants.cpp: 78 additions & 3 deletions
@@ -115,8 +115,10 @@ struct SignHelper {
return _mm256_sign_epi8(value, make_signs(sign_bits[0] | (sign_bits[1] << 16)));
#endif
}
inline void sign_4_values(const uint16_t * sign_bits, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
IQK_ALWAYS_INLINE void sign_4_values(const uint16_t * sign_bits, __m256i * values) const {
// Somehow the FANCY_SIMD version has become 50% slower for TG???
#ifdef z_HAVE_FANCY_SIMD
//__mmask32 mask[4]; std::memcpy(mask, sign_bits, 4*sizeof(__mmask32));
const __mmask32 * mask = (const __mmask32 *)sign_bits;
values[0] = _mm256_mask_sub_epi8(values[0], mask[0], _mm256_setzero_si256(), values[0]);
values[1] = _mm256_mask_sub_epi8(values[1], mask[1], _mm256_setzero_si256(), values[1]);
@@ -534,7 +536,7 @@ struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {

};

#ifdef HAVE_FANCY_SIMD
#ifdef z_HAVE_FANCY_SIMD
// Strangely enough, the following implementation makes PP ~6% slower and TG ~6% faster
// compared to the vanilla AVX2 version below.
struct IndexHelperIQ3S {
@@ -597,6 +599,15 @@ struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
auto scales16 = make_scales(i, d);
scales[0] = MM256_SET_M128I(scales16, scales16);
}
inline void new_block_f(int i, __m256 * scales) {
auto sc16 = make_scales(i, d);
auto scf = _mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(sc16)));
auto scf_l = _mm256_castps256_ps128(scf);
auto scf_h = _mm256_extractf128_ps(scf, 1);
scales[0] = _mm256_set_m128(scf_l, scf_l);
scales[1] = _mm256_set_m128(scf_h, scf_h);
scales[2] = _mm256_mul_ps(scf, _mm256_set1_ps(-minv));
}
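// Added note (sketch of intent, not part of the diff): make_scales() unpacks
// the eight 4-bit block scales, and multiplying by the super-block scale d
// yields the float scales scf[0..7]. scales[0] broadcasts scf[0..3] into both
// 128-bit lanes, scales[1] does the same for scf[4..7], and
// scales[2] = -minv * scf folds the dequantization offset minv into a
// per-block bias for the Q8_2 dot product.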
inline float new_block(int i, __m256i * scales, __m256i& mins) {
auto scales16 = make_scales(i, d);
mins = scb.shuffle(scales16);
@@ -1832,6 +1843,60 @@ void iqk_convert_iq3_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
}
}

void iqk_convert_iq3_s_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);

int nb = n/QK_K;

const block_iq3_s * x8[8];

block_q8_0_r8 * y = (block_q8_0_r8 *)vy;

ggml_half dh[8];
uint16_t all_ls[64];
SignHelper sh;
IndexHelperIQ3S helper;

uint32_t block[8];
__m256i values[8];

for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_s *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
dh[k] = x8[k][i].d;
auto qs = x8[k][i].qs;
auto qh = x8[k][i].qh;
auto signs = (const uint16_t *)x8[k][i].signs;
helper.make2(qs+ 0, qh+0, values+0);
helper.make2(qs+16, qh+2, values+2);
sh.sign_4_values(signs+0, values+0);
helper.make2(qs+32, qh+4, values+4);
helper.make2(qs+48, qh+6, values+6);
sh.sign_4_values(signs+8, values+4);
for (int ib32 = 0; ib32 < 8; ++ib32) {
all_ls[8*ib32 + k] = (2*((x8[k][i].scales[ib32/2] >> 4*(ib32%2)) & 0xf) + 1);
_mm256_storeu_si256((__m256i *)block, values[ib32]);
auto q8 = (uint32_t *)y[ib32].qs;
for (int l = 0; l < 4; ++l) {
q8[8*l + k + 0] = block[l + 0];
q8[8*l + k + 32] = block[l + 4];
}
}
}
auto vd = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)dh));
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
auto iscales16 = _mm_loadu_si128((const __m128i *)all_ls + ib32);
auto iscales32 = _mm256_cvtepi16_epi32(iscales16);
auto scales = _mm256_mul_ps(vd, _mm256_cvtepi32_ps(iscales32));
_mm_storeu_si128((__m128i *)y[ib32].d, _mm256_cvtps_ph(scales, _MM_FROUND_TO_NEAREST_INT));
}
y += QK_K/32;
}
}
}
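// Added layout note: block_q8_0_r8 interleaves 8 rows. Within the 32-quant
// chunk ib32, the 4-byte group l of row k is stored at q8[8*l + k] (first 16
// quants) and q8[8*l + k + 32] (second 16), so a kernel can load the same
// quant positions of all 8 rows with one contiguous 32-byte access;
// y[ib32].d holds the 8 fp16 scales d * (2*ls + 1).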

template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX_NY>& funcs) {
funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
@@ -1867,6 +1932,15 @@ bool iqk_set_kernels_iquants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels, mul_mat_t& func16) {
return false;
}

if (ggml_type(typeA) == GGML_TYPE_IQ3_S) {
if (ggml_type(typeB) == GGML_TYPE_Q8_2_X4) {
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_K_q8_2_IQ_N, DequantizerIQ3S, kernels);
func16 = nullptr;
return true;
}
return false;
}
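// Added note: on AVX2 the IQ3_S x Q8_2_X4 pairing dispatches to the
// mul_mat_qX_K_q8_2_IQ_N template family; func16 = nullptr leaves the
// (presumably ny = 16) special-case slot empty for this path.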

if (ggml_type(typeB) != GGML_TYPE_Q8_K) {
return false;
}
@@ -1926,6 +2000,7 @@ bool iqk_convert_iquants_q80_r8(int type, int n, const void * vx, size_t bx, void * vy, int nrc_x) {
switch (ggml_type(type)) {
case GGML_TYPE_IQ2_XXS: iqk_convert_iq2_xxs_q8_0_r8(n, vx, bx, vy, nrc_x); break;
case GGML_TYPE_IQ3_XXS: iqk_convert_iq3_xxs_q8_0_r8(n, vx, bx, vy, nrc_x); break;
case GGML_TYPE_IQ3_S : iqk_convert_iq3_s_q8_0_r8 (n, vx, bx, vy, nrc_x); break;
default: return false;
}
return true;
ggml/src/iqk/iqk_mul_mat.cpp: 1 addition & 0 deletions
@@ -241,6 +241,7 @@ struct MulMat {
case GGML_TYPE_IQ4_KT : return nrc_y >= 32 ? GGML_TYPE_F32 : type;
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
default: break;
}
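// Added note: once a matmul has at least 32 activation columns (nrc_y >= 32),
// IQ3_S rows are first repacked to Q8_0_R8 via iqk_convert_iq3_s_q8_0_r8
// above, the same batching strategy already used for IQ2_XXS, IQ3_XXS and
// IQ1_S.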