33// GGML internal header
44
55#include " ggml.h"
6-
76#include < assert.h>
87#include < math.h>
98#include < stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
109#include < stdbool.h>
1110#include < stdint.h>
1211#include < string.h>
1312
13+ #ifdef __ARM_FEATURE_SVE
14+ #include < arm_sve.h>
15+ #endif // __ARM_FEATURE_SVE
16+
17+ #if defined(__ARM_NEON)
18+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
19+ //
20+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
21+ //
22+ #include < arm_neon.h>
23+ #endif
24+
1425#ifdef __cplusplus
1526extern " C" {
1627#endif
@@ -29,13 +40,13 @@ extern "C" {
2940// if C99 - static_assert is noop
3041// ref: https://stackoverflow.com/a/53923785/4039976
3142#ifndef __cplusplus
32- #ifndef static_assert
33- #if defined(__STDC_VERSION__ ) && (__STDC_VERSION__ >= 201100L )
34- #define static_assert (cond , msg ) _Static_assert(cond, msg)
35- #else
36- #define static_assert (cond , msg ) struct global_scope_noop_trick
37- #endif
38- #endif
43+ #ifndef static_assert
44+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
45+ #define static_assert (cond, msg ) _Static_assert (cond, msg)
46+ #else
47+ #define static_assert (cond, msg ) struct global_scope_noop_trick
48+ #endif
49+ #endif
3950#endif
4051
4152static inline int ggml_up32 (int n) {
@@ -121,14 +132,12 @@ struct ggml_map_custom1_op_params {
121132 void * userdata;
122133};
123134
124-
125135struct ggml_map_custom2_op_params {
126136 ggml_custom2_op_t fun;
127137 int n_tasks;
128138 void * userdata;
129139};
130140
131-
132141struct ggml_map_custom3_op_params {
133142 ggml_custom3_op_t fun;
134143 int n_tasks;
@@ -291,167 +300,152 @@ void ggml_aligned_free(void * ptr, size_t size);
291300// FP16 to FP32 conversion
292301
293302#if defined(__ARM_NEON)
294-
295- // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
296- //
297- // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
298- //
299- #include <arm_neon.h>
300-
301- #ifdef _MSC_VER
302- typedef uint16_t ggml_fp16_internal_t ;
303- #else
304- typedef __fp16 ggml_fp16_internal_t ;
305- #endif
303+ #ifdef _MSC_VER
304+ typedef uint16_t ggml_fp16_internal_t ;
305+ #else
306+ typedef __fp16 ggml_fp16_internal_t ;
307+ #endif
306308#endif
307309
308310#if defined(__ARM_NEON) && !defined(_MSC_VER)
311+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
312+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16(x)
309313
310- #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
311- #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16(x)
312-
313- #define GGML_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
314+ #define GGML_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
314315
315- static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h ) {
316- ggml_fp16_internal_t tmp ;
317- memcpy (& tmp , & h , sizeof (ggml_fp16_t ));
318- return (float )tmp ;
319- }
320-
321- static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f ) {
322- ggml_fp16_t res ;
323- ggml_fp16_internal_t tmp = f ;
324- memcpy (& res , & tmp , sizeof (ggml_fp16_t ));
325- return res ;
326- }
316+ static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h) {
317+ ggml_fp16_internal_t tmp;
318+ memcpy (&tmp, &h, sizeof (ggml_fp16_t ));
319+ return (float )tmp;
320+ }
327321
328- #else
322+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f) {
323+ ggml_fp16_t res;
324+ ggml_fp16_internal_t tmp = f;
325+ memcpy (&res, &tmp, sizeof (ggml_fp16_t ));
326+ return res;
327+ }
329328
330- #ifdef __F16C__
329+ #elif defined( __F16C__)
331330
332- #ifdef _MSC_VER
333- #define GGML_COMPUTE_FP16_TO_FP32 (x ) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
334- #define GGML_COMPUTE_FP32_TO_FP16 (x ) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
335- #else
336- #define GGML_COMPUTE_FP16_TO_FP32 (x ) _cvtsh_ss(x)
337- #define GGML_COMPUTE_FP32_TO_FP16 (x ) _cvtss_sh(x, 0)
338- #endif
331+ #ifdef _MSC_VER
332+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
333+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0 ), 0 )
334+ #else
335+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) _cvtsh_ss(x)
336+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) _cvtss_sh(x, 0 )
337+ #endif
339338
340339#elif defined(__POWER9_VECTOR__)
341340
342- #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
343- #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16(x)
344- /* the inline asm below is about 12% faster than the lookup method */
345- #define GGML_FP16_TO_FP32 (x ) GGML_COMPUTE_FP16_TO_FP32(x)
346- #define GGML_FP32_TO_FP16 (x ) GGML_COMPUTE_FP32_TO_FP16(x)
347-
348- static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h ) {
349- register float f ;
350- register double d ;
351- __asm__(
352- "mtfprd %0,%2\n"
353- "xscvhpdp %0,%0\n"
354- "frsp %1,%0\n" :
355- /* temp */ "=d" (d ),
356- /* out */ "=f" (f ):
357- /* in */ "r" (h ));
358- return f ;
359- }
341+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
342+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16(x)
343+ /* the inline asm below is about 12% faster than the lookup method */
344+ #define GGML_FP16_TO_FP32 (x ) GGML_COMPUTE_FP16_TO_FP32(x)
345+ #define GGML_FP32_TO_FP16 (x ) GGML_COMPUTE_FP32_TO_FP16(x)
346+
347+ static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h) {
348+ register float f;
349+ register double d;
350+ __asm__ (
351+ " mtfprd %0,%2\n "
352+ " xscvhpdp %0,%0\n "
353+ " frsp %1,%0\n " :
354+ /* temp */ " =d" (d),
355+ /* out */ " =f" (f):
356+ /* in */ " r" (h));
357+ return f;
358+ }
360359
361- static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f ) {
362- register double d ;
363- register ggml_fp16_t r ;
364- __asm__( /* xscvdphp can work on double or single precision */
365- "xscvdphp %0,%2\n"
366- "mffprd %1,%0\n" :
367- /* temp */ "=d" (d ),
368- /* out */ "=r" (r ):
369- /* in */ "f" (f ));
370- return r ;
371- }
360+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f) {
361+ register double d;
362+ register ggml_fp16_t r;
363+ __asm__ ( /* xscvdphp can work on double or single precision */
364+ " xscvdphp %0,%2\n "
365+ " mffprd %1,%0\n " :
366+ /* temp */ " =d" (d),
367+ /* out */ " =r" (r):
368+ /* in */ " f" (f));
369+ return r;
370+ }
372371
373372#else
374373
375- // FP16 <-> FP32
376- // ref: https://github.com/Maratyszcza/FP16
377-
378- static inline float fp32_from_bits (uint32_t w ) {
379- union {
380- uint32_t as_bits ;
381- float as_value ;
382- } fp32 ;
383- fp32 .as_bits = w ;
384- return fp32 .as_value ;
385- }
386-
387- static inline uint32_t fp32_to_bits (float f ) {
388- union {
389- float as_value ;
390- uint32_t as_bits ;
391- } fp32 ;
392- fp32 .as_value = f ;
393- return fp32 .as_bits ;
394- }
374+ // FP16 <-> FP32
375+ // ref: https://github.com/Maratyszcza/FP16
395376
396- static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h ) {
397- const uint32_t w = (uint32_t ) h << 16 ;
398- const uint32_t sign = w & UINT32_C (0x80000000 );
399- const uint32_t two_w = w + w ;
400-
401- const uint32_t exp_offset = UINT32_C (0xE0 ) << 23 ;
402- #if defined(__STDC_VERSION__ ) && (__STDC_VERSION__ >= 199901L ) || defined(__GNUC__ ) && !defined(__STRICT_ANSI__ )
403- const float exp_scale = 0x1.0p-112f ;
404- #else
405- const float exp_scale = fp32_from_bits (UINT32_C (0x7800000 ));
406- #endif
407- const float normalized_value = fp32_from_bits ((two_w >> 4 ) + exp_offset ) * exp_scale ;
408-
409- const uint32_t magic_mask = UINT32_C (126 ) << 23 ;
410- const float magic_bias = 0.5f ;
411- const float denormalized_value = fp32_from_bits ((two_w >> 17 ) | magic_mask ) - magic_bias ;
377+ static inline float fp32_from_bits (uint32_t w) {
378+ union {
379+ uint32_t as_bits;
380+ float as_value;
381+ } fp32;
382+ fp32.as_bits = w;
383+ return fp32.as_value ;
384+ }
412385
413- const uint32_t denormalized_cutoff = UINT32_C (1 ) << 27 ;
414- const uint32_t result = sign |
415- (two_w < denormalized_cutoff ? fp32_to_bits (denormalized_value ) : fp32_to_bits (normalized_value ));
416- return fp32_from_bits (result );
417- }
386+ static inline uint32_t fp32_to_bits (float f) {
387+ union {
388+ float as_value;
389+ uint32_t as_bits;
390+ } fp32;
391+ fp32.as_value = f;
392+ return fp32.as_bits ;
393+ }
418394
419- static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f ) {
420- #if defined(__STDC_VERSION__ ) && (__STDC_VERSION__ >= 199901L ) || defined(__GNUC__ ) && !defined(__STRICT_ANSI__ )
421- const float scale_to_inf = 0x1.0p+112f ;
422- const float scale_to_zero = 0x1.0p-110f ;
423- #else
424- const float scale_to_inf = fp32_from_bits (UINT32_C (0x77800000 ));
425- const float scale_to_zero = fp32_from_bits (UINT32_C (0x08800000 ));
426- #endif
427- float base = (fabsf (f ) * scale_to_inf ) * scale_to_zero ;
428-
429- const uint32_t w = fp32_to_bits (f );
430- const uint32_t shl1_w = w + w ;
431- const uint32_t sign = w & UINT32_C (0x80000000 );
432- uint32_t bias = shl1_w & UINT32_C (0xFF000000 );
433- if (bias < UINT32_C (0x71000000 )) {
434- bias = UINT32_C (0x71000000 );
395+ static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h) {
396+ const uint32_t w = (uint32_t ) h << 16 ;
397+ const uint32_t sign = w & UINT32_C (0x80000000 );
398+ const uint32_t two_w = w + w;
399+
400+ const uint32_t exp_offset = UINT32_C (0xE0 ) << 23 ;
401+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
402+ const float exp_scale = 0x1 .0p-112f ;
403+ #else
404+ const float exp_scale = fp32_from_bits (UINT32_C (0x7800000 ));
405+ #endif
406+ const float normalized_value = fp32_from_bits ((two_w >> 4 ) + exp_offset) * exp_scale;
407+
408+ const uint32_t magic_mask = UINT32_C (126 ) << 23 ;
409+ const float magic_bias = 0 .5f ;
410+ const float denormalized_value = fp32_from_bits ((two_w >> 17 ) | magic_mask) - magic_bias;
411+
412+ const uint32_t denormalized_cutoff = UINT32_C (1 ) << 27 ;
413+ const uint32_t result = sign |
414+ (two_w < denormalized_cutoff ? fp32_to_bits (denormalized_value) : fp32_to_bits (normalized_value));
415+ return fp32_from_bits (result);
435416 }
436417
437- base = fp32_from_bits ((bias >> 1 ) + UINT32_C (0x07800000 )) + base ;
438- const uint32_t bits = fp32_to_bits (base );
439- const uint32_t exp_bits = (bits >> 13 ) & UINT32_C (0x00007C00 );
440- const uint32_t mantissa_bits = bits & UINT32_C (0x00000FFF );
441- const uint32_t nonsign = exp_bits + mantissa_bits ;
442- return (sign >> 16 ) | (shl1_w > UINT32_C (0xFF000000 ) ? UINT16_C (0x7E00 ) : nonsign );
443- }
418+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f) {
419+ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
420+ const float scale_to_inf = 0x1 .0p+112f ;
421+ const float scale_to_zero = 0x1 .0p-110f ;
422+ #else
423+ const float scale_to_inf = fp32_from_bits (UINT32_C (0x77800000 ));
424+ const float scale_to_zero = fp32_from_bits (UINT32_C (0x08800000 ));
425+ #endif
426+ float base = (fabsf (f) * scale_to_inf) * scale_to_zero;
427+
428+ const uint32_t w = fp32_to_bits (f);
429+ const uint32_t shl1_w = w + w;
430+ const uint32_t sign = w & UINT32_C (0x80000000 );
431+ uint32_t bias = shl1_w & UINT32_C (0xFF000000 );
432+ if (bias < UINT32_C (0x71000000 )) {
433+ bias = UINT32_C (0x71000000 );
434+ }
444435
445- #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
446- #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16(x)
436+ base = fp32_from_bits ((bias >> 1 ) + UINT32_C (0x07800000 )) + base;
437+ const uint32_t bits = fp32_to_bits (base);
438+ const uint32_t exp_bits = (bits >> 13 ) & UINT32_C (0x00007C00 );
439+ const uint32_t mantissa_bits = bits & UINT32_C (0x00000FFF );
440+ const uint32_t nonsign = exp_bits + mantissa_bits;
441+ return (sign >> 16 ) | (shl1_w > UINT32_C (0xFF000000 ) ? UINT16_C (0x7E00 ) : nonsign);
442+ }
447443
448- #endif // __F16C__
444+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32(x)
445+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16(x)
449446
450447#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
451448
452- #ifdef __ARM_FEATURE_SVE
453- #include <arm_sve.h>
454- #endif // __ARM_FEATURE_SVE
455449
456450// precomputed f32 table for f16 (256 KB)
457451// defined in ggml.c, initialized in ggml_init()
0 commit comments