Convergence IKL > Croco

Nexesenex · Nexesenex · commit dc650a0219d9 · 2025-06-19T00:33:35.000+02:00
Forgotten changes, more convergence & updated refs

Harmonize some refs with GGML_API

More refs and fewer warnings
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
@@ -600,12 +600,6 @@ typedef struct {
 } block_iq1_m;
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
 
-// Used by IQ1_M quants
-typedef union {
-    ggml_half f16;
-    uint16_t  u16;
-} iq1m_scale_t;
-
 // 1.75 bpw - blocks of 32 with 4 interleaved rows = 128 quants
 typedef struct {
     uint8_t  qs[16];     // grid index, low 8 bits
@@ -633,10 +627,10 @@ typedef struct {
 static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding");
 
 // Used by IQ1_M quants
-/* typedef union {
+typedef union {
     ggml_half f16;
     uint16_t  u16;
-} iq1m_scale_t; */
+} iq1m_scale_t;
 
 // Non-linear quants
 #define QK4_NL 32
@@ -718,6 +712,24 @@ typedef struct {
 } block_iq2_ks;
 static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
 
+typedef struct {
+    uint8_t  scales[QK_K/64];
+    uint8_t  ql[QK_K/4];
+} block_iq2_kt;
+static_assert(sizeof(block_iq2_kt) == QK_K/4 + QK_K/64, "wrong iq2_kt block size/padding");
+
+typedef struct {
+    uint8_t  scales[QK_K/64];
+    uint8_t  ql[QK_K/4];
+    uint8_t  qh[QK_K/8];
+} block_iq3_kt;
+static_assert(sizeof(block_iq3_kt) == QK_K/4 + QK_K/8 + QK_K/64, "wrong iq3_kt block size/padding");
+
+typedef struct {
+    uint32_t qs[QK_K/8];
+} block_iq4_kt;
+static_assert(sizeof(block_iq4_kt) == QK_K/2, "wrong iq4_kt block size/padding");
+
 typedef struct {
     ggml_half d;
     uint16_t extra;
@@ -806,25 +818,6 @@ typedef struct {
 } block_iq5_ks_r4;
 static_assert(sizeof(block_iq5_ks_r4) == 4*sizeof(block_iq5_ks), "wrong iq5_ks_r4 block size/padding");
 
-// IQ_KT
-
-typedef struct {
-    uint8_t  scales[QK_K/64];
-    uint8_t  ql[QK_K/4];
-} block_iq2_kt;
-static_assert(sizeof(block_iq2_kt) == QK_K/4 + QK_K/64, "wrong iq2_kt block size/padding");
-
-typedef struct {
-    uint8_t  scales[QK_K/64];
-    uint8_t  ql[QK_K/4];
-    uint8_t  qh[QK_K/8];
-} block_iq3_kt;
-static_assert(sizeof(block_iq3_kt) == QK_K/4 + QK_K/8 + QK_K/64, "wrong iq3_kt block size/padding");
-
-typedef struct {
-    uint32_t qs[QK_K/8];
-} block_iq4_kt;
-static_assert(sizeof(block_iq4_kt) == QK_K/2, "wrong iq4_kt block size/padding");
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
@@ -2262,7 +2255,6 @@ GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S)
 GGML_TABLE_END()
 #endif
 
-
 //IK
 GGML_TABLE_BEGIN(int8_t, iq2nl_values, 8)
     -31, -13, 1, 17,   -26, -8, 6, 22
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
@@ -54,6 +54,12 @@
 #define GROUP_MAX_EPS_IQ1_M 1e-7f
 #define GROUP_MAX_EPS_IQ1_S 1e-12f
 
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #define UNUSED GGML_UNUSED
 
 // reference implementation for deterministic creation of model files
@@ -450,8 +456,6 @@ void dequantize_row_q6_0(const block_q6_0 * GGML_RESTRICT x, float * GGML_RESTRI
     }
 }
 
-// void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
-
 void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
     static const int qk = QK8_0;
 
@@ -814,6 +818,42 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST
 
     const float q4scale = 15.f;
 
+    // Detect TriNet
+    {
+        int n = k;
+        float max = 0;
+        for (int j = 0; j < n; ++j) {
+            float ax = fabsf(x[j]);
+            max = MAX(max, ax);
+        }
+        float mse0 = 0, mse = 0;
+        for (int j = 0; j < n; ++j) {
+            int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
+            mse0 += x[j]*x[j];
+            float diff = x[j] - max*l;
+            mse += diff*diff;
+        }
+        if (mse < 0.1f*mse0) {
+            // yes, most likely trinet
+            for (int ibl = 0; ibl < nb; ++ibl) {
+                y[ibl].d = GGML_FP32_TO_FP16(max);
+                y[ibl].dmin = GGML_FP32_TO_FP16(max);
+                for (int ib = 0; ib < QK_K/16; ++ib) y[ibl].scales[ib] = 1 | (1 << 4);
+                const float * xb = x + QK_K * ibl;
+                for (int j = 0; j < QK_K; ++j) {
+                    L[j] = xb[j] < -0.5f*max ? 0 : xb[j] < 0.5f*max ? 1 : 2;
+                }
+                uint8_t * qs = y[ibl].qs;
+                for (int j = 0; j < QK_K; j += 128) {
+                    for (int l = 0; l < 32; ++l) {
+                        qs[l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+                    }
+                    qs += 32;
+                }
+            }
+            return;
+        }
+    }
     for (int i = 0; i < nb; i++) {
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
@@ -33,7 +33,7 @@ GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K *
 GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-// GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
 
 GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
@@ -278,7 +278,7 @@ GGML_API size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RES
 GGML_API void dequantize_row_iq3_ks(const block_iq3_ks  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void vec_dot_iq3_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
-void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+GGML_API void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 GGML_API void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q8_K128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -289,21 +289,21 @@ GGML_API void quantize_row_q8_0_x4(const float * GGML_RESTRICT x, void * GGML_RE
 GGML_API void quantize_row_q8_1_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q8_2_x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
-void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
-void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
+GGML_API void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
+GGML_API void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
 
-void iqk_repack_tensor(struct ggml_tensor * tensor);
-bool iqk_modify_tensor(struct ggml_tensor * tensor);
+GGML_API void iqk_repack_tensor(struct ggml_tensor * tensor);
+GGML_API bool iqk_modify_tensor(struct ggml_tensor * tensor);
 
-int iqk_repacked_type(const struct ggml_tensor * tensor); // int instead of ggml_type so we don't need to include ggml.h
-bool iqk_should_modify_tensor(const struct ggml_tensor * tensor);
+GGML_API int iqk_repacked_type(const struct ggml_tensor * tensor); // int instead of ggml_type so we don't need to include ggml.h
+GGML_API bool iqk_should_modify_tensor(const struct ggml_tensor * tensor);
 
 // So we can re-pack Microsoft's BitNet I2_S quants
 GGML_API void dequantize_row_ms_i2s(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
 typedef void (*to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 typedef void (*from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-void iqk_quantize_any(int from_type, int to_type,
+GGML_API void iqk_quantize_any(int from_type, int to_type,
                       int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3,
                       uint64_t nb0, uint64_t nb1, uint64_t nb2, uint64_t nb3,
                       const void * GGML_RESTRICT x, void * GGML_RESTRICT y, void * work_buffer,