Skip to content

Commit dc650a0

Browse files
committed
Convergence IKL > Croco
Forgotten changes, more convergence & updated refs. Harmonize some refs with GGML_API. More refs and fewer warnings.
1 parent a5aa1f3 commit dc650a0

File tree

4 files changed

+71
-39
lines changed

4 files changed

+71
-39
lines changed

ggml/src/ggml-common.h

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -600,12 +600,6 @@ typedef struct {
600600
} block_iq1_m;
601601
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
602602

603-
// Used by IQ1_M quants
604-
typedef union {
605-
ggml_half f16;
606-
uint16_t u16;
607-
} iq1m_scale_t;
608-
609603
// 1.75 bpw - blocks of 32 with 4 interleaved rows = 128 quants
610604
typedef struct {
611605
uint8_t qs[16]; // grid index, low 8 bits
@@ -633,10 +627,10 @@ typedef struct {
633627
static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
634628

635629
// Used by IQ1_M quants
636-
/* typedef union {
630+
typedef union {
637631
ggml_half f16;
638632
uint16_t u16;
639-
} iq1m_scale_t; */
633+
} iq1m_scale_t;
640634

641635
// Non-linear quants
642636
#define QK4_NL 32
@@ -718,6 +712,24 @@ typedef struct {
718712
} block_iq2_ks;
719713
static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
720714

715+
typedef struct {
716+
uint8_t scales[QK_K/64];
717+
uint8_t ql[QK_K/4];
718+
} block_iq2_kt;
719+
static_assert(sizeof(block_iq2_kt) == QK_K/4 + QK_K/64, "wrong iq2_kt block size/padding");
720+
721+
typedef struct {
722+
uint8_t scales[QK_K/64];
723+
uint8_t ql[QK_K/4];
724+
uint8_t qh[QK_K/8];
725+
} block_iq3_kt;
726+
static_assert(sizeof(block_iq3_kt) == QK_K/4 + QK_K/8 + QK_K/64, "wrong iq3_kt block size/padding");
727+
728+
typedef struct {
729+
uint32_t qs[QK_K/8];
730+
} block_iq4_kt;
731+
static_assert(sizeof(block_iq4_kt) == QK_K/2, "wrong iq4_kt block size/padding");
732+
721733
typedef struct {
722734
ggml_half d;
723735
uint16_t extra;
@@ -806,25 +818,6 @@ typedef struct {
806818
} block_iq5_ks_r4;
807819
static_assert(sizeof(block_iq5_ks_r4) == 4*sizeof(block_iq5_ks), "wrong iq5_ks_r4 block size/padding");
808820

809-
// IQ_KT
810-
811-
typedef struct {
812-
uint8_t scales[QK_K/64];
813-
uint8_t ql[QK_K/4];
814-
} block_iq2_kt;
815-
static_assert(sizeof(block_iq2_kt) == QK_K/4 + QK_K/64, "wrong iq2_kt block size/padding");
816-
817-
typedef struct {
818-
uint8_t scales[QK_K/64];
819-
uint8_t ql[QK_K/4];
820-
uint8_t qh[QK_K/8];
821-
} block_iq3_kt;
822-
static_assert(sizeof(block_iq3_kt) == QK_K/4 + QK_K/8 + QK_K/64, "wrong iq3_kt block size/padding");
823-
824-
typedef struct {
825-
uint32_t qs[QK_K/8];
826-
} block_iq4_kt;
827-
static_assert(sizeof(block_iq4_kt) == QK_K/2, "wrong iq4_kt block size/padding");
828821

829822
#endif // GGML_COMMON_DECL
830823
#endif // GGML_COMMON_DECL
@@ -2262,7 +2255,6 @@ GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S)
22622255
GGML_TABLE_END()
22632256
#endif
22642257

2265-
22662258
//IK
22672259
GGML_TABLE_BEGIN(int8_t, iq2nl_values, 8)
22682260
-31, -13, 1, 17, -26, -8, 6, 22

ggml/src/ggml-quants.c

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@
5454
#define GROUP_MAX_EPS_IQ1_M 1e-7f
5555
#define GROUP_MAX_EPS_IQ1_S 1e-12f
5656

57+
#if defined(_MSC_VER)
58+
// disable "possible loss of data" to avoid warnings for hundreds of casts
59+
// we should just be careful :)
60+
#pragma warning(disable: 4244 4267)
61+
#endif
62+
5763
#define UNUSED GGML_UNUSED
5864

5965
// reference implementation for deterministic creation of model files
@@ -450,8 +456,6 @@ void dequantize_row_q6_0(const block_q6_0 * GGML_RESTRICT x, float * GGML_RESTRI
450456
}
451457
}
452458

453-
// void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
454-
455459
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
456460
static const int qk = QK8_0;
457461

@@ -814,6 +818,42 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST
814818

815819
const float q4scale = 15.f;
816820

821+
// Detect TriNet
822+
{
823+
int n = k;
824+
float max = 0;
825+
for (int j = 0; j < n; ++j) {
826+
float ax = fabsf(x[j]);
827+
max = MAX(max, ax);
828+
}
829+
float mse0 = 0, mse = 0;
830+
for (int j = 0; j < n; ++j) {
831+
int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
832+
mse0 += x[j]*x[j];
833+
float diff = x[j] - max*l;
834+
mse += diff*diff;
835+
}
836+
if (mse < 0.1f*mse0) {
837+
// yes, most likely trinet
838+
for (int ibl = 0; ibl < nb; ++ibl) {
839+
y[ibl].d = GGML_FP32_TO_FP16(max);
840+
y[ibl].dmin = GGML_FP32_TO_FP16(max);
841+
for (int ib = 0; ib < QK_K/16; ++ib) y[ibl].scales[ib] = 1 | (1 << 4);
842+
const float * xb = x + QK_K * ibl;
843+
for (int j = 0; j < QK_K; ++j) {
844+
L[j] = xb[j] < -0.5f*max ? 0 : xb[j] < 0.5f*max ? 1 : 2;
845+
}
846+
uint8_t * qs = y[ibl].qs;
847+
for (int j = 0; j < QK_K; j += 128) {
848+
for (int l = 0; l < 32; ++l) {
849+
qs[l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
850+
}
851+
qs += 32;
852+
}
853+
}
854+
return;
855+
}
856+
}
817857
for (int i = 0; i < nb; i++) {
818858
float max_scale = 0; // as we are deducting the min, scales are always positive
819859
float max_min = 0;

ggml/src/ggml-quants.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K *
3333
GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
3434
GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
3535
GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
36-
// GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
36+
GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
3737

3838
GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
3939
GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);

ggml/src/iqk/iqk_quantize.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ GGML_API size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RES
278278
GGML_API void dequantize_row_iq3_ks(const block_iq3_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
279279
GGML_API void vec_dot_iq3_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
280280

281-
void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
281+
GGML_API void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
282282
GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
283283
GGML_API void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
284284
GGML_API void quantize_row_q8_K128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -289,21 +289,21 @@ GGML_API void quantize_row_q8_0_x4(const float * GGML_RESTRICT x, void * GGML_RE
289289
GGML_API void quantize_row_q8_1_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
290290
GGML_API void quantize_row_q8_2_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
291291

292-
void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
293-
void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
292+
GGML_API void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
293+
GGML_API void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
294294

295-
void iqk_repack_tensor(struct ggml_tensor * tensor);
296-
bool iqk_modify_tensor(struct ggml_tensor * tensor);
295+
GGML_API void iqk_repack_tensor(struct ggml_tensor * tensor);
296+
GGML_API bool iqk_modify_tensor(struct ggml_tensor * tensor);
297297

298-
int iqk_repacked_type(const struct ggml_tensor * tensor); // int instead of ggml_type so we don't need to include ggml.h
299-
bool iqk_should_modify_tensor(const struct ggml_tensor * tensor);
298+
GGML_API int iqk_repacked_type(const struct ggml_tensor * tensor); // int instead of ggml_type so we don't need to include ggml.h
299+
GGML_API bool iqk_should_modify_tensor(const struct ggml_tensor * tensor);
300300

301301
// So we can re-pack Microsoft's BitNet I2_S quants
302302
GGML_API void dequantize_row_ms_i2s(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
303303

304304
typedef void (*to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
305305
typedef void (*from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
306-
void iqk_quantize_any(int from_type, int to_type,
306+
GGML_API void iqk_quantize_any(int from_type, int to_type,
307307
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3,
308308
uint64_t nb0, uint64_t nb1, uint64_t nb2, uint64_t nb3,
309309
const void * GGML_RESTRICT x, void * GGML_RESTRICT y, void * work_buffer,

0 commit comments

Comments
 (0)