@@ -2239,36 +2239,44 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
-#ifdef GGML_USE_RUNTIME_REPACK
-
 // buffer type AARCH64
 
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+
 #include "ggml-aarch64.h"
 
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
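+    // record the optimal repacked layout in tensor->extra; set_tensor converts the data to it on upload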
+    tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
+
+    GGML_UNUSED(buffer);
+}
+
 static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    bool quantize = tensor->type == GGML_TYPE_Q4_0 &&
-                    tensor->op == GGML_OP_NONE &&
-                    strcmp(tensor->name, "token_embd.weight") != 0;
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
 
-    if (quantize) {
-        GGML_ASSERT(offset == 0);
-        if (ggml_prepare_optimal_kernel(tensor, data, size) == 0) {
-            return;
-        }
-    }
-    memcpy((char *)tensor->data + offset, data, size);
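+    // repack the incoming data into the layout recorded by init_tensor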
+    enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
+
+    ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
 
     GGML_UNUSED(buffer);
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_aarch64_buffer_i = {
     /* .free_buffer   = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor   = */ NULL, // no initialization required
+    /* .init_tensor   = */ ggml_backend_cpu_aarch64_buffer_init_tensor,
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_cpu_aarch64_buffer_set_tensor,
-    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .get_tensor    = */ NULL,
+    /* .cpy_tensor    = */ NULL,
     /* .clear         = */ ggml_backend_cpu_buffer_clear,
     /* .reset         = */ NULL,
 };
@@ -2298,33 +2306,37 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
             /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
+            /* .is_host        = */ NULL,
         },
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context  = */ NULL,
+        /* .context = */ NULL,
     };
 
     return &ggml_backend_cpu_buffer_type_aarch64;
 }
-#endif
+
+bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
+    return buft == ggml_backend_cpu_aarch64_buffer_type();
+}
 
 static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
-    static ggml_backend_buffer_type_t bufts[3];
-    int index = 0;
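+    // build the list once on first use; C++ static-local initialization makes this thread-safe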
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
+        std::vector<ggml_backend_buffer_type_t> bufts;
 
 #ifdef GGML_USE_CPU_HBM
-    bufts[index++] = ggml_backend_cpu_hbm_buffer_type();
+        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
 #endif
 
-#ifdef GGML_USE_RUNTIME_REPACK
-    if (ggml_cpu_has_neon() || ggml_cpu_has_matmul_int8() || ggml_cpu_has_sve()) {
-        bufts[index++] = ggml_backend_cpu_aarch64_buffer_type();
-    }
+#ifdef GGML_USE_CPU_AARCH64
+        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
 #endif
 
-    bufts[index] = NULL; // Terminate the list
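+        // keep the list NULL-terminated, as with the fixed array it replaces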
+        bufts.push_back(NULL);
+
+        return bufts;
+    }();
 
-    return bufts;
+    return bufts.data();
 
     GGML_UNUSED(device);
 }
@@ -2635,15 +2647,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
 }
 
 static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-#ifdef GGML_USE_RUNTIME_REPACK
-    const struct ggml_tensor * tensor = op->src[0];
-    if (tensor && tensor->buffer && (strcmp(tensor->buffer->buft->iface.get_name(tensor->buffer->buft), "CPU_AARCH64") == 0)) {
-        if (op->op == GGML_OP_MUL_MAT && tensor->type == GGML_TYPE_Q4_0) {
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits_cpu(tensor->type)->vec_dot_type;
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
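+    // weights in an AARCH64 buffer are stored repacked, so they can only feed a Q4_0 matmul with a repacked layout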
+    if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+            return false;
         }
-        return false;
     }
-#endif
+
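+    // any other source stored in an AARCH64 buffer is unusable, since its data is repacked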
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_CPY:
             return
@@ -2652,13 +2670,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
                 op->type != GGML_TYPE_IQ1_S &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32; // FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         case GGML_OP_ROPE_BACK:
             return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
         case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
         default:
             return true;
     }
@@ -2667,7 +2685,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 }
 
 static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
+    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
 
     GGML_UNUSED(dev);
 }
@@ -2721,7 +2739,7 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
         return (void *)ggml_backend_cpu_set_n_threads;
     }
-    if (strcmp(name, "ggml_backend_cpu_get_extra_bufts") == 0) {
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
         return (void *)ggml_backend_cpu_get_extra_bufts;
     }
 
@@ -2738,6 +2756,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
 };
 
 ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    // init CPU feature detection
+    ggml_cpu_init();
+
     static struct ggml_backend_reg ggml_backend_cpu_reg = {
         /* .iface   = */ ggml_backend_cpu_reg_i,
         /* .context = */ NULL,