@@ -3477,101 +3477,87 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }

-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, uint8_t ** pmem, size_t * psize) {
+#ifdef GGML_USE_CPU_AARCH64
+static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t * new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x4 * dst = (block_q4_0x4 *) *pmem;
-    block_q4_0 * src = (block_q4_0 *) t->data;
+    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
     block_q4_0 dst_tmp[4];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 4;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++)
+        {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }

-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, uint8_t ** pmem, size_t * psize) {
+static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 8);

-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t * new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x8 * dst = (block_q4_0x8 *) *pmem;
-    block_q4_0 * src = (block_q4_0 *) t->data;
+    block_q4_0x8 * dst = (block_q4_0x8 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
     block_q4_0 dst_tmp[8];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 8;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }

 // Prepare for optimized kernels if applicable
-void ggml_prepare_optimal_kernel(struct ggml_tensor * cur, uint8_t ** pmem, size_t * psize) {
-    UNUSED(cur);
-    UNUSED(pmem);
-    UNUSED(psize);
-
+int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+    int ret = -1;
 #if defined(__ARM_ARCH)
-    if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_8_8;
-            }
-        }
-        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_8;
-            }
-        }
-        else if (ggml_cpu_has_neon()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_4;
-            }
-        }
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_8_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_4;
+        ret = 0;
     }
 #endif
+    return ret;
+
+    GGML_UNUSED(cur);
+    GGML_UNUSED(data);
+    GGML_UNUSED(data_size);
 }
+#endif
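
For reference, the repack walks the Q4_0 blocks of each group of rows column by column, so block `x` of rows `r .. r+3` (or `r .. r+7` for the x8 variant) ends up contiguous in the tensor buffer. Below is a minimal standalone sketch of that block-level gather using plain `int`s in place of `block_q4_0`; the nibble/scale interleaving done inside `make_block_q4_0x4` / `make_block_q4_0x8` is omitted, and the names `NROW`, `NBLOCKS`, `NIL` are illustrative only, not ggml API.

```c
#include <stdio.h>

// Toy dimensions: 8 "rows" of 3 "blocks" each, interleaved 4 rows at a time.
// In ggml terms: nrow = t->ne[1], nblocks = t->ne[0] / QK4_0, nrows_interleaved = 4.
enum { NROW = 8, NBLOCKS = 3, NIL = 4 };

int main(void) {
    int src[NROW * NBLOCKS];
    int dst[NROW * NBLOCKS];
    for (int i = 0; i < NROW * NBLOCKS; i++) {
        src[i] = i; // row-major block ids: row r, block x -> r*NBLOCKS + x
    }

    // Same traversal as repack_q4_0_to_q4_0_4_bl: for each group of NIL rows,
    // emit block x of every row in the group before moving on to block x+1.
    int out = 0;
    const int * group = src;
    for (int b = 0; b < NROW; b += NIL) {
        for (int x = 0; x < NBLOCKS; x++) {
            for (int i = 0; i < NIL; i++) {
                dst[out++] = group[x + i * NBLOCKS];
            }
        }
        group += NIL * NBLOCKS;
    }

    // Prints: 0 3 6 9 1 4 7 10 2 5 8 11 12 15 18 21 ...
    for (int i = 0; i < NROW * NBLOCKS; i++) {
        printf("%d ", dst[i]);
    }
    printf("\n");
    return 0;
}
```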
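With the new signature the repack is no longer in place on `t->data`: the caller passes the original Q4_0 payload, and the function writes the interleaved layout directly into the tensor's buffer, retagging `cur->type` on success. A hedged sketch of how a caller might use it is shown below; the loader hook `load_q4_0_tensor` and the fallback path are assumptions for illustration, only `ggml_prepare_optimal_kernel` itself comes from this diff.

```c
#include <string.h>   // memcpy
#include "ggml.h"

// Declared in this patch (normally provided by the corresponding ggml header).
int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);

// Hypothetical loader hook: try to repack a Q4_0 tensor into an
// AArch64-friendly interleaved layout; fall back to a plain copy otherwise.
static void load_q4_0_tensor(struct ggml_tensor * cur, const void * data, size_t data_size) {
#ifdef GGML_USE_CPU_AARCH64
    if (cur->type == GGML_TYPE_Q4_0 && ggml_prepare_optimal_kernel(cur, data, data_size) == 0) {
        // data has been repacked into cur->data and cur->type switched to
        // GGML_TYPE_Q4_0_8_8 / _4_8 / _4_4 depending on the detected CPU features
        return;
    }
#endif
    memcpy(cur->data, data, data_size); // keep the original Q4_0 layout
}
```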