@@ -562,6 +562,68 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
562562 UNUSED (ncols_interleaved);
563563 UNUSED (blocklen);
564564
565+
566+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
567+ if (ggml_cpu_has_neon ()) {
568+ const void * b_ptr = vx;
569+ const void * a_ptr = vy;
570+ float * res_ptr = s;
571+
572+ __asm__ __volatile__ (
573+ " movi v31.16b, #0x4\n "
574+ " movi v30.16b, #0xf0\n "
575+ " add %x[b_ptr], %x[b_ptr], #0x8\n "
576+ " 1:" // Column loop
577+ " add x22, %x[a_ptr], #0x2\n "
578+ " movi v29.16b, #0x0\n "
579+ " mov x21, %x[nb]\n "
580+ " 2:" // Block loop
581+ " ldr q28, [%x[b_ptr], #0x0]\n "
582+ " ldr q27, [x22, #0x0]\n "
583+ " movi v26.4s, #0x0\n "
584+ " sub x20, x22, #0x2\n "
585+ " ldr q25, [x22, #0x10]\n "
586+ " ldr q24, [%x[b_ptr], #0x10]\n "
587+ " sub x21, x21, #0x1\n "
588+ " add x22, x22, #0x22\n "
589+ " ldr q23, [%x[b_ptr], #0x20]\n "
590+ " ldr q22, [%x[b_ptr], #0x30]\n "
591+ " ld1r { v21.8h }, [x20]\n "
592+ " ldr q20, [%x[b_ptr], #-0x8]\n "
593+ " sshl v16.16b, v28.16b, v31.16b\n "
594+ " and v28.16b, v28.16b, v30.16b\n "
595+ " sshl v19.16b, v24.16b, v31.16b\n "
596+ " and v24.16b, v24.16b, v30.16b\n "
597+ " add %x[b_ptr], %x[b_ptr], #0x48\n "
598+ " sshl v18.16b, v23.16b, v31.16b\n "
599+ " and v23.16b, v23.16b, v30.16b\n "
600+ " .inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n "
601+ " sshl v17.16b, v22.16b, v31.16b\n "
602+ " and v22.16b, v22.16b, v30.16b\n "
603+ " fcvtl v21.4s, v21.4h\n "
604+ " fcvtl v16.4s, v20.4h\n "
605+ " .inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n "
606+ " fmul v16.4s, v16.4s, v21.4s\n "
607+ " .inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n "
608+ " .inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n "
609+ " .inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n "
610+ " .inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n "
611+ " .inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n "
612+ " .inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n "
613+ " scvtf v26.4s, v26.4s, #0x4\n "
614+ " fmla v29.4s, v26.4s, v16.4s\n "
615+ " cbnz x21, 2b\n "
616+ " sub %x[nc], %x[nc], #0x4\n "
617+ " str q29, [%x[res_ptr], #0x0]\n "
618+ " add %x[res_ptr], %x[res_ptr], #0x10\n "
619+ " cbnz %x[nc], 1b\n "
620+ : [b_ptr] " +&r" (b_ptr), [res_ptr] " +&r" (res_ptr), [nc] " +&r" (nc)
621+ : [a_ptr] " r" (a_ptr), [nb] " r" (nb)
622+ : " memory" , " v16" , " v17" , " v18" , " v19" , " v20" , " v21" , " v22" , " v23" , " v24" , " v25" , " v26" , " v27" , " v28" , " v29" , " v30" , " v31" , " x20" , " x21" , " x22"
623+ );
624+ return ;
625+ }
626+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
565627#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
566628 if (ggml_cpu_has_neon () && ggml_cpu_has_dotprod ()) {
567629 const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
@@ -1128,7 +1190,7 @@ static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
11281190 UNUSED (blocklen);
11291191
11301192#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1131- if (ggml_cpu_has_neon () && ggml_cpu_has_dotprod () ) {
1193+ if (ggml_cpu_has_neon ()) {
11321194 const void * b_ptr = vx;
11331195 const void * a_ptr = vy;
11341196 float * res_ptr = s;
@@ -4136,7 +4198,7 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
41364198 return &ggml::cpu::aarch64::q4_0_4x8_q8_0;
41374199 }
41384200 }
4139- if (ggml_cpu_has_neon () && ggml_cpu_has_dotprod () ) {
4201+ if (ggml_cpu_has_neon ()) {
41404202 if (cur->ne [1 ] % 4 == 0 ) {
41414203 return &ggml::cpu::aarch64::q4_0_4x4_q8_0;
41424204 }
0 commit comments