Skip to content

Commit 3c1f2c6

Browse files
authored
Fix Compile error (C2668) (#508)
* cmake: force MSVC compiler charset to utf-8
* build: apply MSVC /bigobj option to c/cpp files only
* Update CMakeLists.txt
* Fix Compile error (C2668)
* revert hsum_float_8x8
1 parent fa90a98 commit 3c1f2c6

File tree

2 files changed

+18
-48
lines changed

2 files changed

+18
-48
lines changed

ggml/src/iqk/iqk_common.h

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -225,23 +225,38 @@ static inline int hsum_i32_8(const __m256i a) {
225225
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
226226
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
227227
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
228-
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
228+
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
229229
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
230230
}
231+
static inline float hmax_f32_8(__m256 x) {
232+
__m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
233+
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
234+
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
235+
return _mm_cvtss_f32(max4);
236+
}
231237
static inline float hmax_float_8(__m256 x) {
232238
__m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
233239
max4 = _mm_max_ps( max4, _mm_movehl_ps(max4, max4));
234240
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4));
235241
return _mm_cvtss_f32(max4);
236242
}
237243

244+
static inline __m128 hsum_float_4x4(__m128 * accm) {
245+
accm[0] = _mm_add_ps(_mm_unpacklo_ps(accm[0], accm[2]), _mm_unpackhi_ps(accm[0], accm[2]));
246+
accm[1] = _mm_add_ps(_mm_unpacklo_ps(accm[1], accm[3]), _mm_unpackhi_ps(accm[1], accm[3]));
247+
return _mm_add_ps(_mm_unpacklo_ps(accm[0], accm[1]), _mm_unpackhi_ps(accm[0], accm[1]));
248+
}
238249
static inline __m256 hsum_float_8x8(__m256 * accm) {
239250
for (int i = 0; i < 4; ++i) {
240-
accm[i] = _mm256_add_ps(_mm256_permute2f128_ps(accm[i], accm[i+4], 0x20), _mm256_permute2f128_ps(accm[i], accm[i+4], 0x31));
251+
accm[i] = _mm256_add_ps(_mm256_permute2f128_ps(accm[i], accm[i + 4], 0x20), _mm256_permute2f128_ps(accm[i], accm[i + 4], 0x31));
241252
//accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)),
242253
// _mm_add_ps(_mm256_castps256_ps128(accm[i+0]), _mm256_extractf128_ps(accm[i+0], 1)));
243254
}
244-
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
255+
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i + 2]), _mm256_unpackhi_ps(accm[i], accm[i + 2]));
256+
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
257+
}
258+
static inline __m256 hsum_float_4x8(__m256 * accm) {
259+
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i + 2]), _mm256_unpackhi_ps(accm[i], accm[i + 2]));
245260
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
246261
}
247262

ggml/src/iqk/iqk_quantize.cpp

Lines changed: 0 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -557,32 +557,6 @@ void quantize_row_q8_K64(const float * x, void * y, int64_t k) {
557557
quantize_row_q8_K64_ref(x, (block_q8_K64 *)y, k);
558558
}
559559

560-
#ifdef __AVX2__
561-
namespace {
562-
inline float hsum_float_4(__m128 x) {
563-
x = _mm_add_ps(x, _mm_movehl_ps(x, x));
564-
x = _mm_add_ss(x, _mm_movehdup_ps(x));
565-
return _mm_cvtss_f32(x);
566-
}
567-
inline float hsum_float_8(__m256 x) {
568-
return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
569-
}
570-
inline int hsum_i32_8(const __m256i a) {
571-
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
572-
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
573-
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
574-
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
575-
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
576-
}
577-
inline float hmax_f32_8(__m256 x) {
578-
__m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
579-
max4 = _mm_max_ps( max4, _mm_movehl_ps(max4, max4));
580-
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4));
581-
return _mm_cvtss_f32(max4);
582-
}
583-
}
584-
#endif
585-
586560
void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) {
587561
float * dptr = (float *)vy;
588562
int8_t * qy = (int8_t *)(dptr + 5);
@@ -7413,25 +7387,6 @@ void dequantize_row_ms_i2s(const void * vx, float * y, int64_t k) {
74137387
}
74147388

74157389
namespace {
7416-
#ifdef __AVX2__
7417-
__m128 hsum_float_4x4(__m128 * accm) {
7418-
accm[0] = _mm_add_ps(_mm_unpacklo_ps(accm[0], accm[2]), _mm_unpackhi_ps(accm[0], accm[2]));
7419-
accm[1] = _mm_add_ps(_mm_unpacklo_ps(accm[1], accm[3]), _mm_unpackhi_ps(accm[1], accm[3]));
7420-
return _mm_add_ps(_mm_unpacklo_ps(accm[0], accm[1]), _mm_unpackhi_ps(accm[0], accm[1]));
7421-
}
7422-
__m256 hsum_float_8x8(__m256 * accm) {
7423-
for (int i = 0; i < 4; ++i) {
7424-
accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)),
7425-
_mm_add_ps(_mm256_castps256_ps128(accm[i+0]), _mm256_extractf128_ps(accm[i+0], 1)));
7426-
}
7427-
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
7428-
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
7429-
}
7430-
__m256 hsum_float_4x8(__m256 * accm) {
7431-
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
7432-
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
7433-
}
7434-
#endif
74357390
template <int block_size, int group_size, int num_bits, bool is_abs = false>
74367391
class QuantizerIQKT {
74377392
static_assert(group_size == 8 || group_size == 4);

0 commit comments

Comments
 (0)