Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 2 additions & 19 deletions src/ggml-cpu/ggml-cpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

#include "ggml.h"
#include "ggml-impl.h"

#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
//#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf


#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -69,33 +69,16 @@ struct ggml_compute_params {
#endif

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
#endif

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON)

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>

// ref: https://github.com/ggml-org/llama.cpp/pull/5404
#ifdef _MSC_VER

typedef uint16_t ggml_fp16_internal_t;

#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }

#else

typedef __fp16 ggml_fp16_internal_t;

#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }

#endif // _MSC_VER

#if !defined(__aarch64__)
Expand Down
8 changes: 4 additions & 4 deletions src/ggml-cpu/simd-mappings.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
Expand Down Expand Up @@ -99,7 +99,7 @@
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
Expand All @@ -114,7 +114,7 @@
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
Expand All @@ -125,7 +125,7 @@
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
Expand Down
19 changes: 7 additions & 12 deletions src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
#if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
Expand Down Expand Up @@ -311,29 +311,24 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);

// FP16 to FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON)
Copy link
Collaborator

@cmdr2 cmdr2 Apr 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't there be a corresponding && !defined(__MSC_VER) here? Since we're only removing the CUDA check

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if MSVC supports __fp16, we don't need this check. But maybe it was added because it does not support it. If that's the case, we'll have to bring it back and fallback to the reference implementation.

Copy link
Collaborator

@cmdr2 cmdr2 Apr 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if MSVC supports __fp16, we don't need this check.

Yeah true. The original PR said: can't use native __fp16 type as it's not supported by MSVC - ggml-org/llama.cpp#3007

Maybe it has changed since that PR, but I can't find anything to this on the internet.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha, this explains it.

But on the other hand, we have a MSVC+Arm CI job in llama.cpp that appears to be passing with this change:

https://github.com/ggml-org/llama.cpp/actions/runs/14262095042/job/39975786169

So maybe it was indeed fixed?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. This could be merged in, and if there are any reports of regression on MSVC+Arm Neon, then it's easy to fix it back.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MSVC cannot be used to build for Arm, the CPU backend does not allow it.

This is because of the issue with the Arm inline assembly that is not supported by the MSVC compiler, correct?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Support for Arm MSVC was removed when fixing the CPU feature check with GGML_NATIVE in cmake, since it would require writing a specific implementation for it. You could still build with MSVC without the inline assembly kernels before that, it would just be slower. Since Clang is always available and shipped with Visual Studio, there was no reason to spend time on that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. So I think it that case, this PR should be good to merge?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know, I don't have the full context of why this code exists in this way.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did some additional digging and found the need for the CUDA and MUSA checks. They do seem to be necessary so I added comments in the code and updated the OP to try to make things a bit more clear. This PR now just removes the ggml_fp16_internal_t type which is now redundant and cleans up some macros and include logic.

Let me know if you see any red flags.

#if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
typedef uint16_t ggml_fp16_internal_t;
#else
typedef __fp16 ggml_fp16_internal_t;
#endif
#endif

#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
__fp16 tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
__fp16 tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
Expand Down Expand Up @@ -485,7 +480,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
#endif // defined(__ARM_NEON)

// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
Expand Down