implement E8M0_TO_FP32_ANY for better e

horasal · horasal · commit 37e43acc0dfa · 2025-10-28T19:18:04.000+09:00
diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -861,7 +861,7 @@ void ggml_vec_dot_mxfp6_e3m2_q8_0(int n, float * GGML_RESTRICT s, size_t bs, con
         const int16_t* kvalues = (const int16_t*)kvalues_mxfp6_e3m2;
 
         for (int i = 0; i < nb; ++i) {
-            const float d = GGML_CPU_FP16_TO_FP32(y[i].d) * GGML_E8M0_TO_FP32_HALF(x[i].e);
+            const float d = GGML_CPU_FP16_TO_FP32(y[i].d) * GGML_E8M0_TO_FP32_ANY(x[i].e, 4);
 
             const __m256i q8_v = _mm256_loadu_si256((const __m256i*)y[i].qs);
 
@@ -932,7 +932,7 @@ void ggml_vec_dot_mxfp6_e2m3_q8_0(int n, float * GGML_RESTRICT s, size_t bs, con
     const int16_t* kvalues = (const int16_t*)kvalues_mxfp6_e2m3;
 
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[i].d) * GGML_E8M0_TO_FP32_HALF(x[i].e);
+        const float d = GGML_CPU_FP16_TO_FP32(y[i].d) * GGML_E8M0_TO_FP32_ANY(x[i].e, 3);
 
         const __m256i q8_v = _mm256_loadu_si256((const __m256i*)y[i].qs);
 
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
@@ -243,7 +243,7 @@ void ggml_vec_dot_mxfp6_e3m2_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
     float sumf = 0;
 
     for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_ANY(x[ib].e, 4);
         int sumi = 0;
         // Q8_0 (y) * MXFP6 (block_size = 32)
         for (int j = 0; j < QK_MXFP6_E3M2/4; ++j) {
@@ -294,7 +294,7 @@ void ggml_vec_dot_mxfp6_e2m3_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
     float sumf = 0;
 
     for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_ANY(x[ib].e, 3);
         int sumi = 0;
         // Q8_0 (y) * MXFP6 (block_size = 32)
         for (int j = 0; j < QK_MXFP6_E2M3/4; ++j) {
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
@@ -513,10 +513,10 @@ static __global__ void dequantize_block_mxfp6_e3m2(const void * __restrict__ vx,
         const uint8_t v2_idx = (b1 >> 4) | ((b2 & 0x03) << 4);
         const uint8_t v3_idx = b2 >> 2;
 
-        y[y_offset + 0] = d * kvalues_mxfp6_e3m2[v0_idx]*0.0625f;
-        y[y_offset + 1] = d * kvalues_mxfp6_e3m2[v1_idx]*0.0625f;
-        y[y_offset + 2] = d * kvalues_mxfp6_e3m2[v2_idx]*0.0625f;
-        y[y_offset + 3] = d * kvalues_mxfp6_e3m2[v3_idx]*0.0625f;
+        y[y_offset + 0] = d * kvalues_mxfp6_e3m2[v0_idx]*MXFP6_E3M2_SCALER;
+        y[y_offset + 1] = d * kvalues_mxfp6_e3m2[v1_idx]*MXFP6_E3M2_SCALER;
+        y[y_offset + 2] = d * kvalues_mxfp6_e3m2[v2_idx]*MXFP6_E3M2_SCALER;
+        y[y_offset + 3] = d * kvalues_mxfp6_e3m2[v3_idx]*MXFP6_E3M2_SCALER;
     }
 }
 
@@ -552,11 +552,10 @@ static __global__ void dequantize_block_mxfp6_e2m3(const void * __restrict__ vx,
         const uint8_t v2_idx = (b1 >> 4) | ((b2 & 0x03) << 4);
         const uint8_t v3_idx = b2 >> 2;
 
-        // Is this correct?
-        y[y_offset + 0] = d * kvalues_mxfp6_e2m3[v0_idx]*0.0625f;
-        y[y_offset + 1] = d * kvalues_mxfp6_e2m3[v1_idx]*0.0625f;
-        y[y_offset + 2] = d * kvalues_mxfp6_e2m3[v2_idx]*0.0625f;
-        y[y_offset + 3] = d * kvalues_mxfp6_e2m3[v3_idx]*0.0625f;
+        y[y_offset + 0] = d * kvalues_mxfp6_e2m3[v0_idx]*MXFP6_E2M3_SCALER;
+        y[y_offset + 1] = d * kvalues_mxfp6_e2m3[v1_idx]*MXFP6_E2M3_SCALER;
+        y[y_offset + 2] = d * kvalues_mxfp6_e2m3[v2_idx]*MXFP6_E2M3_SCALER;
+        y[y_offset + 3] = d * kvalues_mxfp6_e2m3[v3_idx]*MXFP6_E2M3_SCALER;
     }
 }
 
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
@@ -488,8 +488,30 @@ static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
     return result;
 }
 
+// Returns 2^(x - 127 - e) as a float by building the IEEE-754 bit pattern directly; intended to equal ggml_e8m0_to_fp32(x) / 2^e.
+// Useful with MXFP6 quantization since the kvalues tables are stored at a different scale (call sites pass e = 4 for E3M2, e = 3 for E2M3). Requires e <= 22, otherwise the subnormal shift count below can go negative.
+static inline float ggml_e8m0_to_fp32_any(uint8_t x, const uint8_t e) {
+    uint32_t bits;
+    const int cutoff = e + 1; // smallest x whose result is a normal (non-subnormal) float
+    if (x < cutoff) {
+        // x <= e: the result is below 2^(-126), so it is encoded as a subnormal with a single mantissa bit set.
+        // Mantissa bit k is worth 2^(k - 149), so bit (x - e + 22) encodes 2^(x - e - 127).
+        // Example for e = 4: x=0 -> 0x00040000 = 2^(-131), x=1 -> 0x00080000 = 2^(-130), ...
+        bits = (uint32_t)1 << (x - e + 22);
+    }
+    else {
+        // Normal range: biased exponent field E = x - e (>= 1), mantissa left at zero.
+        bits = (uint32_t)(x - e) << 23;
+    }
+
+    float result;
+    memcpy(&result, &bits, sizeof(float)); // reinterpret the bit pattern as a float without aliasing issues
+    return result;
+}
+
 #define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
 #define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
+#define GGML_E8M0_TO_FP32_ANY(x,e) ggml_e8m0_to_fp32_any(x,e)
 
 /**
  * Converts brain16 to float32.
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
@@ -270,6 +270,40 @@ static inline int best_index_mxfp4(float x, float e) {
     return best_index;
 }
 
+void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k) {
+    static const int qk = QK_MXFP4;
+    // Quantizes k floats from x into k/qk blocks of y; each block stores one shared exponent byte (e) and two 4-bit indices per qs byte.
+    assert(k % qk == 0); // k must be a whole number of blocks
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // largest |v| seen in block i
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+        // Shared exponent byte: floor(log2(amax)) - 2, biased by 127; an all-zero block stores 0.
+        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0;
+        // NOTE(review): the float -> uint8_t cast above is out of range when the expression is negative (amax < 2^-125) - confirm inputs avoid this.
+        const float d = GGML_E8M0_TO_FP32_HALF(e); // float scale for this block, decoded from e
+
+        y[i].e = e;
+        // best_index_mxfp4 chooses a table index for each value given the block scale d (presumably the nearest kvalue - confirm).
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t x0 = best_index_mxfp4(x[i*qk + 0    + j], d);
+            const uint8_t x1 = best_index_mxfp4(x[i*qk + qk/2 + j], d);
+            // Elements j and j + qk/2 share byte qs[j]: x0 in the low nibble, x1 shifted into the high nibble.
+            y[i].qs[j]  = x0;
+            y[i].qs[j] |= x1 << 4;
+        }
+    }
+}
+
 static inline int best_index_mxfp6_e3m2(float x, float e) {
     int best_index = 0;
     float best_err = fabsf(kvalues_mxfp6_e3m2[0]*e - x);
@@ -301,9 +335,9 @@ void quantize_row_mxfp6_e3m2_ref(const float * GGML_RESTRICT x, block_mxfp6_e3m2
             }
         }
 
-        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 4 + 127) : 0;
+        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 3 + 127) : 0;
 
-        const float d = GGML_E8M0_TO_FP32_HALF(e);
+        const float d = GGML_E8M0_TO_FP32_ANY(e, 4);
 
         y[i].e = e;
 
@@ -355,9 +389,9 @@ void quantize_row_mxfp6_e2m3_ref(const float * GGML_RESTRICT x, block_mxfp6_e2m3
             }
         }
 
-        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 3 + 127) : 0;
+        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 1 + 127) : 0;
 
-        const float d = GGML_E8M0_TO_FP32_HALF(e);
+        const float d = GGML_E8M0_TO_FP32_ANY(e, 3);
 
         y[i].e = e;
 
@@ -378,40 +412,6 @@ void quantize_row_mxfp6_e2m3_ref(const float * GGML_RESTRICT x, block_mxfp6_e2m3
     }
 }
 
-void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k) {
-    static const int qk = QK_MXFP4;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-            }
-        }
-
-        const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0;
-
-        const float d = GGML_E8M0_TO_FP32_HALF(e);
-
-        y[i].e = e;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t x0 = best_index_mxfp4(x[i*qk + 0    + j], d);
-            const uint8_t x1 = best_index_mxfp4(x[i*qk + qk/2 + j], d);
-
-            y[i].qs[j]  = x0;
-            y[i].qs[j] |= x1 << 4;
-        }
-    }
-}
-
 void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
     static const int qk = QK4_0;
 
@@ -550,7 +550,7 @@ void dequantize_row_mxfp6_e3m2(const block_mxfp6_e3m2 * GGML_RESTRICT x, float *
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        const float d = GGML_E8M0_TO_FP32_HALF(x[i].e);
+        const float d = GGML_E8M0_TO_FP32_ANY(x[i].e, 4);
 
         for (int j = 0; j < qk / 4; ++j) {
             const int16_t x0 = kvalues_mxfp6_e3m2[x[i].qs[3 * j] & 0x3F];
@@ -574,7 +574,7 @@ void dequantize_row_mxfp6_e2m3(const block_mxfp6_e2m3 * GGML_RESTRICT x, float *
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        const float d = GGML_E8M0_TO_FP32_HALF(x[i].e);
+        const float d = GGML_E8M0_TO_FP32_ANY(x[i].e, 3);
 
         for (int j = 0; j < qk / 4; ++j) {
             const int16_t x0 = kvalues_mxfp6_e2m3[x[i].qs[3 * j] & 0x3F];
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
@@ -11,12 +11,10 @@
 import numpy as np
 
 
-# see ggml_e8m0_to_fp32_half in ggml-impl.h
-def e8m0_to_fp32_half(x: np.ndarray) -> np.ndarray:
-    bits = np.where(x < 2, np.uint32(0x00200000) << np.uint32(x), np.uint32(x - 1) << np.uint32(23))
+def e8m0_to_fp32_any(x: np.ndarray, e: np.uint32) -> np.ndarray:  # NumPy counterpart of ggml_e8m0_to_fp32_any in ggml-impl.h; callers in this file pass a plain int for e
+    bits = np.where(x < e + 1, np.uint32(1) << np.uint32(x - e + 22), np.uint32(x - e) << np.uint32(23))
     return bits.view(np.float32)
 
-
 def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
     block_size, type_size = GGML_QUANT_SIZES[quant_type]
     if shape[-1] % block_size != 0:
@@ -666,7 +664,7 @@ class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
 
     @classmethod
     def __e8m0_to_fp32_half(cls, x: np.ndarray) -> np.ndarray:
-        e8m0_to_fp32_half(x)
+        return e8m0_to_fp32_any(x, 1)
 
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
@@ -721,8 +719,8 @@ class MXFP6E3M2(__Quant, qtype=GGMLQuantizationType.MXFP6_E3M2):
     )
 
     @classmethod
-    def __e8m0_to_fp32_half(cls, x: np.ndarray) -> np.ndarray:
-        e8m0_to_fp32_half(x)
+    def __e8m0_to_fp32_scaled(cls, x: np.ndarray) -> np.ndarray:
+        return e8m0_to_fp32_any(x, 4)
 
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
@@ -732,12 +730,12 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 
         with np.errstate(divide="ignore"):
             # convert log2(d_max) to e8m0
-            e = np.where(d_max > 0, np.floor(np.log2(d_max)) - 4 + 127, 0).astype(
+            e = np.where(d_max > 0, np.floor(np.log2(d_max)) - 9 + 127, 0).astype(
                 np.uint8
             )
 
         # d is float of above e8m0
-        d = cls.__e8m0_to_fp32_half(e)
+        d = cls.__e8m0_to_fp32_scaled(e)
 
         kvalues = np.array(cls.kvalues, dtype=np.int16).reshape((1, 1, 64))
 
@@ -774,7 +772,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 
         e, qs = np.hsplit(blocks, [1])
 
-        d = cls.__e8m0_to_fp32_half(e).astype(np.float32)
+        d = cls.__e8m0_to_fp32_scaled(e).astype(np.float32)
 
         qs_groups = qs.reshape((n_blocks, -1, 3))
         b0 = qs_groups[..., 0]
@@ -808,8 +806,8 @@ class MXFP6E2M3(__Quant, qtype=GGMLQuantizationType.MXFP6_E2M3):
     )
 
     @classmethod
-    def __e8m0_to_fp32_half(cls, x: np.ndarray) -> np.ndarray:
-        e8m0_to_fp32_half(x)
+    def __e8m0_to_fp32_scaled(cls, x: np.ndarray) -> np.ndarray:
+        return e8m0_to_fp32_any(x, 3)
 
     @classmethod
     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
@@ -819,12 +817,12 @@ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 
         with np.errstate(divide="ignore"):
             # convert log2(d_max) to e8m0
-            e = np.where(d_max > 0, np.floor(np.log2(d_max)) - 3 + 127, 0).astype(
+            e = np.where(d_max > 0, np.floor(np.log2(d_max)) - 6 + 127, 0).astype(
                 np.uint8
             )
 
         # d is float of above e8m0
-        d = cls.__e8m0_to_fp32_half(e)
+        d = cls.__e8m0_to_fp32_scaled(e)
 
         kvalues = np.array(cls.kvalues, dtype=np.int16).reshape((1, 1, 64))
 
@@ -861,7 +859,7 @@ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 
         e, qs = np.hsplit(blocks, [1])
 
-        d = cls.__e8m0_to_fp32_half(e).astype(np.float32)
+        d = cls.__e8m0_to_fp32_scaled(e).astype(np.float32)
 
         qs_groups = qs.reshape((n_blocks, -1, 3))
         b0 = qs_groups[..., 0]