Skip to content

Commit 74cb026

Browse files
committed
build fixes
1 parent bf79cb3 commit 74cb026

File tree

2 files changed

+143
-149
lines changed

2 files changed

+143
-149
lines changed

ggml/src/ggml-backend-reg.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
#include "ggml-backend.h"
21
#include "ggml-backend-impl.h"
2+
#include "ggml-backend.h"
3+
#include "ggml-cpu.h"
4+
#include "ggml-impl.h"
35
#include <cstring>
46
#include <vector>
57

@@ -45,8 +47,6 @@
4547
#include "ggml-kompute.h"
4648
#endif
4749

48-
#include "ggml-cpu.h"
49-
5050
struct ggml_backend_registry {
5151
std::vector<ggml_backend_reg_t> backends;
5252
std::vector<ggml_backend_dev_t> devices;

ggml/src/ggml-impl.h

Lines changed: 140 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,25 @@
33
// GGML internal header
44

55
#include "ggml.h"
6-
76
#include <assert.h>
87
#include <math.h>
98
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
109
#include <stdbool.h>
1110
#include <stdint.h>
1211
#include <string.h>
1312

13+
#ifdef __ARM_FEATURE_SVE
14+
#include <arm_sve.h>
15+
#endif // __ARM_FEATURE_SVE
16+
17+
#if defined(__ARM_NEON)
18+
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
19+
//
20+
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
21+
//
22+
#include <arm_neon.h>
23+
#endif
24+
1425
#ifdef __cplusplus
1526
extern "C" {
1627
#endif
@@ -29,13 +40,13 @@ extern "C" {
2940
// if C99 - static_assert is noop
3041
// ref: https://stackoverflow.com/a/53923785/4039976
3142
#ifndef __cplusplus
32-
#ifndef static_assert
33-
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
34-
#define static_assert(cond, msg) _Static_assert(cond, msg)
35-
#else
36-
#define static_assert(cond, msg) struct global_scope_noop_trick
37-
#endif
38-
#endif
43+
#ifndef static_assert
44+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
45+
#define static_assert(cond, msg) _Static_assert(cond, msg)
46+
#else
47+
#define static_assert(cond, msg) struct global_scope_noop_trick
48+
#endif
49+
#endif
3950
#endif
4051

4152
static inline int ggml_up32(int n) {
@@ -121,14 +132,12 @@ struct ggml_map_custom1_op_params {
121132
void * userdata;
122133
};
123134

124-
125135
struct ggml_map_custom2_op_params {
126136
ggml_custom2_op_t fun;
127137
int n_tasks;
128138
void * userdata;
129139
};
130140

131-
132141
struct ggml_map_custom3_op_params {
133142
ggml_custom3_op_t fun;
134143
int n_tasks;
@@ -291,167 +300,152 @@ void ggml_aligned_free(void * ptr, size_t size);
291300
// FP16 to FP32 conversion
292301

293302
#if defined(__ARM_NEON)
294-
295-
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
296-
//
297-
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
298-
//
299-
#include <arm_neon.h>
300-
301-
#ifdef _MSC_VER
302-
typedef uint16_t ggml_fp16_internal_t;
303-
#else
304-
typedef __fp16 ggml_fp16_internal_t;
305-
#endif
303+
#ifdef _MSC_VER
304+
typedef uint16_t ggml_fp16_internal_t;
305+
#else
306+
typedef __fp16 ggml_fp16_internal_t;
307+
#endif
306308
#endif
307309

308310
#if defined(__ARM_NEON) && !defined(_MSC_VER)
311+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
312+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
309313

310-
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
311-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
312-
313-
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
314+
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
314315

315-
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
316-
ggml_fp16_internal_t tmp;
317-
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
318-
return (float)tmp;
319-
}
320-
321-
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
322-
ggml_fp16_t res;
323-
ggml_fp16_internal_t tmp = f;
324-
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
325-
return res;
326-
}
316+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
317+
ggml_fp16_internal_t tmp;
318+
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
319+
return (float)tmp;
320+
}
327321

328-
#else
322+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
323+
ggml_fp16_t res;
324+
ggml_fp16_internal_t tmp = f;
325+
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
326+
return res;
327+
}
329328

330-
#ifdef __F16C__
329+
#elif defined(__F16C__)
331330

332-
#ifdef _MSC_VER
333-
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
334-
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
335-
#else
336-
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
337-
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
338-
#endif
331+
#ifdef _MSC_VER
332+
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
333+
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
334+
#else
335+
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
336+
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
337+
#endif
339338

340339
#elif defined(__POWER9_VECTOR__)
341340

342-
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
343-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
344-
/* the inline asm below is about 12% faster than the lookup method */
345-
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
346-
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
347-
348-
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
349-
register float f;
350-
register double d;
351-
__asm__(
352-
"mtfprd %0,%2\n"
353-
"xscvhpdp %0,%0\n"
354-
"frsp %1,%0\n" :
355-
/* temp */ "=d"(d),
356-
/* out */ "=f"(f):
357-
/* in */ "r"(h));
358-
return f;
359-
}
341+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
342+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
343+
/* the inline asm below is about 12% faster than the lookup method */
344+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
345+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
346+
347+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
348+
register float f;
349+
register double d;
350+
__asm__(
351+
"mtfprd %0,%2\n"
352+
"xscvhpdp %0,%0\n"
353+
"frsp %1,%0\n" :
354+
/* temp */ "=d"(d),
355+
/* out */ "=f"(f):
356+
/* in */ "r"(h));
357+
return f;
358+
}
360359

361-
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
362-
register double d;
363-
register ggml_fp16_t r;
364-
__asm__( /* xscvdphp can work on double or single precision */
365-
"xscvdphp %0,%2\n"
366-
"mffprd %1,%0\n" :
367-
/* temp */ "=d"(d),
368-
/* out */ "=r"(r):
369-
/* in */ "f"(f));
370-
return r;
371-
}
360+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
361+
register double d;
362+
register ggml_fp16_t r;
363+
__asm__( /* xscvdphp can work on double or single precision */
364+
"xscvdphp %0,%2\n"
365+
"mffprd %1,%0\n" :
366+
/* temp */ "=d"(d),
367+
/* out */ "=r"(r):
368+
/* in */ "f"(f));
369+
return r;
370+
}
372371

373372
#else
374373

375-
// FP16 <-> FP32
376-
// ref: https://github.com/Maratyszcza/FP16
377-
378-
static inline float fp32_from_bits(uint32_t w) {
379-
union {
380-
uint32_t as_bits;
381-
float as_value;
382-
} fp32;
383-
fp32.as_bits = w;
384-
return fp32.as_value;
385-
}
386-
387-
static inline uint32_t fp32_to_bits(float f) {
388-
union {
389-
float as_value;
390-
uint32_t as_bits;
391-
} fp32;
392-
fp32.as_value = f;
393-
return fp32.as_bits;
394-
}
374+
// FP16 <-> FP32
375+
// ref: https://github.com/Maratyszcza/FP16
395376

396-
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
397-
const uint32_t w = (uint32_t) h << 16;
398-
const uint32_t sign = w & UINT32_C(0x80000000);
399-
const uint32_t two_w = w + w;
400-
401-
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
402-
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
403-
const float exp_scale = 0x1.0p-112f;
404-
#else
405-
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
406-
#endif
407-
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
408-
409-
const uint32_t magic_mask = UINT32_C(126) << 23;
410-
const float magic_bias = 0.5f;
411-
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
377+
static inline float fp32_from_bits(uint32_t w) {
378+
union {
379+
uint32_t as_bits;
380+
float as_value;
381+
} fp32;
382+
fp32.as_bits = w;
383+
return fp32.as_value;
384+
}
412385

413-
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
414-
const uint32_t result = sign |
415-
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
416-
return fp32_from_bits(result);
417-
}
386+
static inline uint32_t fp32_to_bits(float f) {
387+
union {
388+
float as_value;
389+
uint32_t as_bits;
390+
} fp32;
391+
fp32.as_value = f;
392+
return fp32.as_bits;
393+
}
418394

419-
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
420-
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
421-
const float scale_to_inf = 0x1.0p+112f;
422-
const float scale_to_zero = 0x1.0p-110f;
423-
#else
424-
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
425-
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
426-
#endif
427-
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
428-
429-
const uint32_t w = fp32_to_bits(f);
430-
const uint32_t shl1_w = w + w;
431-
const uint32_t sign = w & UINT32_C(0x80000000);
432-
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
433-
if (bias < UINT32_C(0x71000000)) {
434-
bias = UINT32_C(0x71000000);
395+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
396+
const uint32_t w = (uint32_t) h << 16;
397+
const uint32_t sign = w & UINT32_C(0x80000000);
398+
const uint32_t two_w = w + w;
399+
400+
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
401+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
402+
const float exp_scale = 0x1.0p-112f;
403+
#else
404+
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
405+
#endif
406+
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
407+
408+
const uint32_t magic_mask = UINT32_C(126) << 23;
409+
const float magic_bias = 0.5f;
410+
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
411+
412+
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
413+
const uint32_t result = sign |
414+
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
415+
return fp32_from_bits(result);
435416
}
436417

437-
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
438-
const uint32_t bits = fp32_to_bits(base);
439-
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
440-
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
441-
const uint32_t nonsign = exp_bits + mantissa_bits;
442-
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
443-
}
418+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
419+
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
420+
const float scale_to_inf = 0x1.0p+112f;
421+
const float scale_to_zero = 0x1.0p-110f;
422+
#else
423+
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
424+
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
425+
#endif
426+
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
427+
428+
const uint32_t w = fp32_to_bits(f);
429+
const uint32_t shl1_w = w + w;
430+
const uint32_t sign = w & UINT32_C(0x80000000);
431+
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
432+
if (bias < UINT32_C(0x71000000)) {
433+
bias = UINT32_C(0x71000000);
434+
}
444435

445-
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
446-
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
436+
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
437+
const uint32_t bits = fp32_to_bits(base);
438+
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
439+
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
440+
const uint32_t nonsign = exp_bits + mantissa_bits;
441+
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
442+
}
447443

448-
#endif // __F16C__
444+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
445+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
449446

450447
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
451448

452-
#ifdef __ARM_FEATURE_SVE
453-
#include <arm_sve.h>
454-
#endif // __ARM_FEATURE_SVE
455449

456450
// precomputed f32 table for f16 (256 KB)
457451
// defined in ggml.c, initialized in ggml_init()

0 commit comments

Comments
 (0)