#include "ggml-fp8.h"

/*
- make clean
- make -j8
- # ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
- ./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
- ./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
- ./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
-
- rm -rf build
- cmake -B build
- cmake --build build --config Release -j $(nproc)
- ./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
- ./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
- ./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
-
- # la CI local:
- rm -rf tmp
- mkdir tmp
- bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
- # HIP legacy target?
- cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
- cmake --build build2 --config Release -j $(nproc) -v
-
+ template<int N>
+ constexpr float exp2() {
+     union {
+         float f;
+         uint32_t bits;
+     } out = {0};
+     out.bits = (N+127)<<23;
+     return out.f;
+ }
*/
-
- template <int N> constexpr float EXP2() {
-     if constexpr (N==0) return 1;
-     if constexpr (N>0) return EXP2<N-1>()*2;
-     if constexpr (N<0) return EXP2<N+1>()/2;
+ template <int N>
+ constexpr float exp2() {
+     uint32_t bits = (N+127)<<23;
+     return reinterpret_cast<float>(bits);
}
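Side note on the new exp2(): (N+127)<<23 drops N+127 straight into the IEEE-754 exponent field, so the resulting bit pattern is exactly 2^N. A by-value reinterpret_cast from uint32_t to float is not valid standard C++, though; a minimal standalone sketch of the same trick with C++20 std::bit_cast (my suggestion, not what this commit uses; exp2_bits is a hypothetical name) looks like:

#include <bit>      // std::bit_cast (C++20)
#include <cstdint>

// 2^N for a compile-time integer N, built from the IEEE-754 layout:
// sign 0, exponent field N+127, mantissa 0.
template <int N>
constexpr float exp2_bits() {
    static_assert(N > -127 && N < 128, "N must map onto a normal float exponent");
    return std::bit_cast<float>(static_cast<uint32_t>(N + 127) << 23);
}

static_assert(exp2_bits<3>() == 8.0f);
static_assert(exp2_bits<-9>() == 0.001953125f);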
-
- // 2^N avec N>0 en entier
- template <int N> constexpr int EXP_I2() {
-     if constexpr (N==0) return 1;
-     if constexpr (N>0) return EXP_I2<N-1>()*2;
+ template <int N>
+ constexpr int exp_i2() {
+     return 1 << N;
}

- template <int _E> // , int M=7-E> 1.7 bits!
+ template <int E> // , int M=7-E> 1.7 bits!
struct FP8 {
    uint8_t bits;
-     using type = FP8<_E>;
-     static constexpr int E=_E;
-     static constexpr int M=7-_E;
-     static constexpr int E_BIAS=EXP2<_E-1>()-1;
-     static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
-     static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
+     using type = FP8<E>;
+     // static constexpr int E=_E;
+     static constexpr int M() { return 7-E; }
+     static constexpr int E_BIAS() { return exp_i2<E-1>()-1; }
+     static constexpr float MAX() { return (2-exp2<-M()+1>())*exp2<exp_i2<E-1>()>(); }
+     static constexpr float MIN() { return exp2<-M()>()*exp2<2-exp_i2<E-1>()>(); }
    // =============================================

#ifdef GGML_USE_OPENMP_SIMD
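As a quick sanity check of what the MAX() and MIN() expressions above evaluate to, here is a standalone recomputation with plain float math for E=4 (E4M3) and E=3 (the E3M4_Q target used in the build notes above); this is a sketch of mine, not part of the commit:

#include <cmath>
#include <cstdio>

int main() {
    // E=4 (E4M3): M = 3, E_BIAS = 2^3 - 1 = 7
    const float e4m3_max = (2.0f - std::ldexp(1.0f, -2)) * std::ldexp(1.0f, 8);  // 448
    const float e4m3_min = std::ldexp(1.0f, -3) * std::ldexp(1.0f, 2 - 8);       // 2^-9 = 0.001953125
    // E=3 (E3M4): M = 4, E_BIAS = 2^2 - 1 = 3
    const float e3m4_max = (2.0f - std::ldexp(1.0f, -3)) * std::ldexp(1.0f, 4);  // 31
    const float e3m4_min = std::ldexp(1.0f, -4) * std::ldexp(1.0f, 2 - 4);       // 2^-6 = 0.015625
    std::printf("E4M3 [%g, %g]  E3M4 [%g, %g]\n", e4m3_min, e4m3_max, e3m4_min, e3m4_max);
    return 0;
}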
@@ -64,19 +47,19 @@ struct FP8 {
            float f;
            uint32_t bits;
        } in = {value};
-         // le signe:
+         // the sign:
        bits = (in.bits >> 24) & 0x80;
-         // la valeur sans la signe!
+         // the value without the sign!
        in.bits &= 0x7fffffff;
        // GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
        if (in.f >= MAX()) {
            bits |= 0x7E;
        } else if (in.f < MIN()) { // => 0.
            // OK: S.0000000
        } else {
-             in.f *= EXP2<E_BIAS-127>();
-             in.bits += 1<<(22-M); // for rounding
-             bits |= (in.bits >> (23-M)) & 0x7F;
+             in.f *= exp2<E_BIAS()-127>();
+             in.bits += 1<<(22-M()); // for rounding
+             bits |= (in.bits >> (23-M())) & 0x7F;
        }
    }

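To make the rebias/round/shift sequence concrete, here is a standalone trace (mine, not part of the commit) of how the branch above encodes 3.0f for E=4 (M=3, E_BIAS=7):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    float f = 3.0f;
    f *= std::ldexp(1.0f, 7 - 127);            // in.f *= exp2<E_BIAS()-127>()  ->  1.5 * 2^-119
    uint32_t u;
    std::memcpy(&u, &f, sizeof u);             // u == 0x04400000: biased exponent 8, mantissa .100
    u += 1u << (22 - 3);                       // round to nearest: add half a ULP of the 3-bit mantissa
    const uint8_t bits = (u >> (23 - 3)) & 0x7F;
    std::printf("0x%02X\n", (unsigned)bits);   // 0x44 == 0 1000 100  ->  (1 + 4/8) * 2^(8-7) = 3.0
    return 0;
}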
@@ -88,13 +71,12 @@ struct FP8 {
            float f;
            uint32_t bits;
        } out = {0};
-         // le signe:
        out.bits = bits & 0x80;
        out.bits <<= 24;
        uint32_t _bits = bits & 0x7F;
-         _bits <<= (23-M);
+         _bits <<= (23-M());
        out.bits |= _bits;
-         out.f *= EXP2<127-E_BIAS>();
+         out.f *= exp2<127-E_BIAS()>();
        return out.f;
    }
};
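And the matching decode direction, again as a standalone trace (mine, not from the commit) of what operator float() above does with the byte 0x44 for E=4:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const uint8_t bits = 0x44;
    uint32_t u = (uint32_t)(bits & 0x80) << 24;   // sign bit into bit 31
    u |= (uint32_t)(bits & 0x7F) << (23 - 3);     // exponent and mantissa into the float fields
    float f;
    std::memcpy(&f, &u, sizeof f);                // 1.5 * 2^(8-127)
    f *= std::ldexp(1.0f, 127 - 7);               // out.f *= exp2<127-E_BIAS()>()
    std::printf("%g\n", f);                       // prints 3
    return 0;
}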
@@ -156,7 +138,7 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    for (int64_t q=0; q<qk_size; ++q) {
        float m = 0;
#ifdef GGML_USE_OPENMP_SIMD
-         // not work on macos and warn.
+         // does not work on macOS and triggers a warning.
        // #pragma omp simd reduction(max:m)
#endif
        for (int64_t i=0; i<QK; i++) {
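The hunk above is the start of the per-block loop that scans each QK-sized block for its maximum. As a rough, generic illustration of the block-scaling idea (my sketch only; the actual bloc_fp8 layout and the rest of conv() are not shown in this hunk, so the names and the exact scale formula here are assumptions):

#include <cmath>

// Each block of QK values shares one scale derived from its absolute maximum,
// so the largest element lands near the top of the FP8 range.
template <int QK>
static void scale_block_sketch(const float* x, float fp8_max, float* scaled, float& scale) {
    float m = 0.0f;
    for (int i = 0; i < QK; ++i) {
        m = std::fmax(m, std::fabs(x[i]));    // per-block absolute maximum
    }
    scale = (m > 0.0f) ? m / fp8_max : 1.0f;  // stored per block next to the FP8 values
    for (int i = 0; i < QK; ++i) {
        scaled[i] = x[i] / scale;             // each entry would then be encoded as FP8
    }
}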