ggml-org · ggerganov · Nov 15, 2024 · Nov 15, 2024 · Nov 14, 2024 · Nov 15, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -92,6 +92,7 @@ else()
 endif()
 
 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 
 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
@@ -116,6 +117,7 @@ endif()
 
 # ggml core
 set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
 
 # 3rd party libs / backends
 option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
@@ -141,7 +143,7 @@ option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
 
-option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
+option(GGML_HIP                             "ggml: use HIP"                                   OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
@@ -238,12 +240,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
 
 if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml LIBRARY)
+    install(TARGETS ggml      LIBRARY)
+    install(TARGETS ggml-base LIBRARY)
 endif()
 
+# FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
     install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
         PERMISSIONS
             OWNER_READ
             OWNER_WRITE

diff --git a/include/ggml-amx.h b/include/ggml-amx.h
@@ -9,16 +9,16 @@ extern "C" {
 #endif
 
 // buffer_type API
-GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
 
-GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_amx_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
 
-GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
 
-GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
 
 #ifdef  __cplusplus
 }

diff --git a/include/ggml-backend.h b/include/ggml-backend.h
@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif

diff --git a/include/ggml-blas.h b/include/ggml-blas.h
@@ -9,15 +9,15 @@ extern "C" {
 #endif
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_blas_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
 
-GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
-GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
 
 
 #ifdef  __cplusplus

diff --git a/include/ggml-cann.h b/include/ggml-cann.h
@@ -34,7 +34,7 @@ extern "C" {
  */
 #define GGML_CANN_MAX_DEVICES 16
 
-GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
 
 /**
  * @brief Initializes the CANN backend for a specified device.
@@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
  * @param device The index of the device to initialize.
  * @return A pointer to the initialized backend instance, or nullptr on failure.
  */
-GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
 
 /**
  * @brief Checks if a given backend is a CANN backend.
@@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
  * @param backend The backend instance to check.
  * @return True if the backend is a CANN backend, false otherwise.
  */
-GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
 
 /**
  * @brief Retrieves the CANN buffer type for a specified device.
@@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
  * @return A pointer to the buffer type interface for the specified device, or
  * nullptr if the device index is out of range.
  */
-GGML_API ggml_backend_buffer_type_t
+GGML_BACKEND_API ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device);
 
 /**
@@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
  *
  * @return The number of CANN devices available.
  */
-GGML_API int32_t ggml_backend_cann_get_device_count(void);
+GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
 
 /**
  * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
  *
  * @return A pointer to the host buffer type interface.
  */
-GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 
 /**
  * @brief Retrieves the description of a specific CANN device.
@@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
  * @param description Pointer to a buffer where the description will be written.
  * @param description_size Size of the description buffer.
  */
-GGML_API void ggml_backend_cann_get_device_description(
+GGML_BACKEND_API void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size);
 
 /**
@@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
  * @param total Pointer to a variable where the total memory size will be
  * stored.
  */
-GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
                                                   size_t* free,
                                                   size_t* total);
 

diff --git a/include/ggml-cpu.h b/include/ggml-cpu.h
@@ -54,54 +54,77 @@ extern "C" {
         GGML_NUMA_STRATEGY_COUNT
     };
 
-    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
 
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 
-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
+    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
                   const struct ggml_cgraph * cgraph,
                                        int   n_threads, /* = GGML_DEFAULT_N_THREADS */
                     struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
-    // TODO: move to backend interface
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_sve        (void);
-    GGML_API int ggml_cpu_has_matmul_int8(void);
-    // get the sve vector length in bytes
-    GGML_API int ggml_cpu_get_sve_cnt(void);
+    //
+    // system info
+    //
+
+    // x86
+    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
+    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
+    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
+    // ARM
+    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
+    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
+    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
+    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
+    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
+    // other
+    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
 
     // Internal types and functions exposed for tests and benchmarks
 
@@ -115,6 +138,7 @@ extern "C" {
                                        const void * GGML_RESTRICT y, int nr, int nc);
 
     struct ggml_type_traits_cpu {
+        ggml_from_float_t        from_float;
         ggml_from_float_to_mat_t from_float_to_mat;
         ggml_vec_dot_t           vec_dot;
         enum ggml_type           vec_dot_type;
@@ -124,27 +148,30 @@ extern "C" {
         ggml_gemm_t              gemm;
     };
 
-    GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
 
-    GGML_API void ggml_cpu_init(void);
+    GGML_BACKEND_API void ggml_cpu_init(void);
 
     //
     // CPU backend
     //
 
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
-    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
 #ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+    GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 
+    GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+    GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/ggml-cuda.h b/include/ggml-cuda.h
@@ -7,7 +7,7 @@
 extern "C" {
 #endif
 
-#ifdef GGML_USE_HIPBLAS
+#ifdef GGML_USE_HIP
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
 #elif defined(GGML_USE_MUSA)
@@ -20,27 +20,27 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
 
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 
 // device buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
-GGML_API int  ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
 
-GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
-GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 
 #ifdef  __cplusplus
 }