77
88#include " ggml-fp8.h"
99
10- /*
11- make clean
12- make -j8
13- # ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
14- ./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
15- ./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
16- ./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
17-
18- rm -rf build
19- cmake -B build
20- cmake --build build --config Release -j $(nproc)
21- ./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
22- ./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
23- ./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
24-
25- # la CI local:
26- rm -rf tmp
27- mkdir tmp
28- bash ./ci/run.sh ./tmp/results ./tmp/mnt
29-
30- # HIP legacy target?
31- cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
32- cmake --build build2 --config Release -j $(nproc) -v
33-
34- */
35-
36- template <int N> constexpr float EXP2 () {
37- if constexpr (N==0 ) return 1 ;
38- if constexpr (N>0 ) return EXP2<N-1 >()*2 ;
39- if constexpr (N<0 ) return EXP2<N+1 >()/2 ;
10+ static constexpr float exp2 (int n) {
11+ return n == 0 ? 1 : n > 0 ? 2 * exp2 (n - 1 ) : exp2 (n + 1 ) / 2 ;
4012}
41-
42- // 2^N avec N>0 en entier
43- template <int N> constexpr int EXP_I2 () {
44- if constexpr (N==0 ) return 1 ;
45- if constexpr (N>0 ) return EXP_I2<N-1 >()*2 ;
13+ static constexpr float exp_i2 (int n) {
14+ return 1 << n;
4615}
4716
48- template <int _E > // , int M=7-E> 1.7 bits!
17+ template <int E > // , int M=7-E> 1.7 bits!
4918struct FP8 {
5019 uint8_t bits;
51- using type = FP8<_E >;
52- static constexpr int E=_E;
53- static constexpr int M=7 -_E ;
54- static constexpr int E_BIAS=EXP2<_E- 1 >( )-1 ;
55- static constexpr float MAX () { return (2 -EXP2< -M+1 >( ))*EXP2<EXP_I2<_E- 1 >()>( ); }
56- static constexpr float MIN () { return EXP2<-M>()*EXP2< 2 -EXP_I2<_E- 1 >()>( ); }
20+ using type = FP8<E >;
21+ // static constexpr int E=_E;
22+ static const int M=7 -E ;
23+ static const int E_BIAS=exp_i2( E-1 )-1 ;
24+ static constexpr float MAX () { return (2 -exp2 ( -M+1 ))*exp2 ( exp_i2 ( E-1 ) ); }
25+ static constexpr float MIN () { return exp2 (-M)* exp2 ( 2 - exp_i2 ( E-1 ) ); }
5726 // =============================================
5827
5928#ifdef GGML_USE_OPENMP_SIMD
@@ -64,17 +33,17 @@ struct FP8 {
6433 float f;
6534 uint32_t bits;
6635 } in = {value};
67- // le signe:
36+ // the sign:
6837 bits = (in.bits >> 24 ) & 0x80 ;
69- // la valeur sans la signe!
38+ // value without the sign!
7039 in.bits &= 0x7fffffff ;
7240 // GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
7241 if (in.f >= MAX ()) {
7342 bits |= 0x7E ;
7443 } else if (in.f <MIN ()) { // => 0.
7544 // OK: S.0000000
7645 } else {
77- in.f *= EXP2< E_BIAS-127 >( );
46+ in.f *= exp2 ( E_BIAS-127 );
7847 in.bits += 1 <<(22 -M); // for rounding
7948 bits |= (in.bits >> (23 -M)) & 0x7F ;
8049 }
@@ -88,13 +57,12 @@ struct FP8 {
8857 float f;
8958 uint32_t bits;
9059 } out = {0 };
91- // le signe:
9260 out.bits = bits & 0x80 ;
9361 out.bits <<= 24 ;
9462 uint32_t _bits = bits & 0x7F ;
9563 _bits <<= (23 -M);
9664 out.bits |= _bits;
97- out.f *= EXP2< 127 -E_BIAS>( );
65+ out.f *= exp2 ( 127 -E_BIAS);
9866 return out.f ;
9967 }
10068};
@@ -156,7 +124,7 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
156124 for (int64_t q=0 ; q<qk_size; ++q) {
157125 float m = 0 ;
158126#ifdef GGML_USE_OPENMP_SIMD
159- // not work on macos and warn.
127+ // does not work on macOS and emits a warning.
160128 // #pragma omp simd reduction(max:m)
161129#endif
162130 for (int64_t i=0 ; i<QK; i++) {
0 commit comments