#define GGML_COMMON_IMPL_C
#include "ggml-common.h"

#include "ggml-fp8.h"

#include <cassert>

/*
# ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
# ./llama-perplexity -f ~/LLM/wikitext-2-raw/wiki.test.raw  -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf

*/

#include <iostream>
#include <cstdint>
#include <algorithm>  // std::max
#include <cmath>      // std::abs
#include <immintrin.h>

template<int N> constexpr float EXP2() {
    if constexpr (N==0) return 1;
    if constexpr (N>0) return EXP2<N-1>()*2;
    if constexpr (N<0) return EXP2<N+1>()/2;
}

// 2^N as an integer, for N >= 0
template<int N> constexpr int EXP_I2() {
    if constexpr (N==0) return 1;
    if constexpr (N>0) return EXP_I2<N-1>()*2;
}
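
// Worked values, following directly from the recursive definitions above:
//   EXP2<0>()   == 1.0f     EXP2<3>()   == 8.0f     EXP2<-3>() == 0.125f
//   EXP_I2<0>() == 1        EXP_I2<4>() == 16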

template<int _E> // 1 sign bit + _E exponent bits + M=7-_E mantissa bits
struct FP8 {
    uint8_t bits;
    using type = FP8<_E>;
    static constexpr int E=_E;
    static constexpr int M=7-_E;
    static constexpr int E_BIAS=EXP2<_E-1>()-1;
    static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
    static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
    //=============================================

    #pragma omp declare simd
    void operator=(float value) {
        union {
            float f;
            uint32_t bits;
        } in = {value};
        // the sign bit:
        bits = (in.bits >> 24) & 0x80;
        // the value without its sign:
        in.bits &= 0x7fffffff;
        //GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
        if (in.f >= MAX()) {
            bits |= 0x7E;  // saturate to MAX()
        } else if (in.f<MIN()) { // => 0.
            // OK: S.0000000
        } else {
            in.f *= EXP2<E_BIAS-127>();
            in.bits += 1<<(22-M); // for rounding
            bits |= (in.bits >> (23-M)) & 0x7F;
        }
    }

    #pragma omp declare simd
    operator float () const {
        union {
            float f;
            uint32_t bits;
        } out = {0};
        // the sign bit:
        out.bits = bits & 0x80;
        out.bits <<= 24;
        uint32_t _bits = bits & 0x7F;
        _bits <<= (23-M);
        out.bits |= _bits;
        out.f *= EXP2<127-E_BIAS>();
        return out.f;
    }
};
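
// With _E == 4 (E4M3) the formulas above give MAX() == 448 and MIN() == 2^-9;
// with _E == 3 (E3M4) they give MAX() == 30.
// Minimal usage sketch (illustrative only, not part of the public API): the two
// conversion operators let FP8<E> be used like a tiny float:
//   FP8<4> v;
//   v = 0.15625f;     // encode: extract sign, rescale, round, pack into 8 bits
//   float back = v;   // decode: unpack to a float32 bit pattern and rescale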

// block_e4m3_q
//typedef struct {
//    float d;  // delta
//    ggml_e4m3 qs[QK_K];
//} block_e4m3_q;

template<int E>
static inline void conv(const FP8<E>* x, float* y, int64_t size) {
    #pragma omp simd
    for (int64_t i=0; i<size; i++) {
        y[i] = (float) x[i];
    }
}

template<int E>
static inline void conv(const float* x, FP8<E>* y, int64_t size) {
    #pragma omp simd
    for (int64_t i=0; i<size; i++) {
        y[i] = x[i];
    }
}

template<int E>
static inline float dot(const FP8<E>* x, const float* y, int64_t size) {
    float z = 0;
    #pragma omp simd reduction(+:z)
    for (int64_t i=0; i<size; i++) {
        z += ((float)x[i])*y[i];
    }
    return z;
}

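// The row-level helpers above work on raw FP8 values with no scaling; they back
// the ggml_e5m2_*/ggml_e4m3_* row conversions below. bloc_fp8 adds a per-block
// float32 scale d: a stored element decodes to d * (float)qs[i].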
template <int E, int QK>
struct bloc_fp8 {
    float d;
    FP8<E> qs[QK];
};

template <int E, int QK>
static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) {
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        #pragma omp simd
        for (int64_t i=0; i<QK; i++) {
            y[q*QK+i] = ((float) x[q].qs[i])*(x[q]).d;
        }
    }
}

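// Block quantization below: d stores max|x| / FP8<E>::MAX() and each qs[i] stores
// x[i] * MAX()/max|x|, so x[i] ~ d * qs[i]. For example with E == 4 (MAX() == 448),
// a block whose largest magnitude is 8 gets d = 8/448 and qs[i] = fp8(x[i] * 56).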
template <int E, int QK>
static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        float m = 0;
        #pragma omp simd reduction(max:m)
        for (int64_t i=0; i<QK; i++) {
            m = std::max(std::abs(x[q*QK+i]),m);
        }
        // guard against an all-zero block: m == 0 would give D = inf and 0*inf = NaN
        const float D = m>0 ? FP8<E>::MAX()/m : 0.f;
        y[q].d = m/FP8<E>::MAX();
        #pragma omp simd
        for (int64_t i=0; i<QK; i++) {
            y[q].qs[i] = x[q*QK+i]*D;
        }
    }
}

template <int E, int QK>
static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size) {
    float z = 0;
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        float z0 = 0;
        #pragma omp simd reduction(+:z0)
        for (int64_t i=0; i<QK; i++) {
            z0 += ((float)x[q].qs[i])*y[q*QK+i];
        }
        z += (x[q]).d * z0;
    }
    return z;
}
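// i.e. the block dot product computes  sum_q d_q * sum_i qs[q][i] * y[q*QK+i],
// accumulating each block in float32 before applying its scale.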

// the C API.
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<5>*>(x), y, k);
}
void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<5>*>(y), k);
}
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i = 0; i < k; ++i) {
        reinterpret_cast<FP8<5>*>(y)[i] = x[i];
    }
}

void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<4>*>(x), y, k);
}
void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<4>*>(y), k);
}
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i = 0; i < k; ++i) {
        reinterpret_cast<FP8<4>*>(y)[i] = x[i];
    }
}

void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
}
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}

void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
}
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}

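// Each ggml_vec_dot_* computes the dot product of one quantized row (vx) with a
// float32 vector (vy) of length n; the stride and row-count arguments are unused
// because only the single-row (nrc == 1) path is implemented here.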
// the dot products for FP8 weights
void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
}

void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
}

void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
}

void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
}