@@ -3476,3 +3476,115 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
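+// Repack a GGML_TYPE_Q4_0 tensor in place into the interleaved block_q4_0x4
+// layout: each group of 4 rows is merged block-by-block via make_block_q4_0x4,
+// which is the layout consumed by the 4-row GEMV/GEMM kernels above.
+// *pmem/*psize is a caller-owned scratch buffer that holds one interleaved
+// row group and is grown with realloc as needed. Returns 0 on success, -1 on OOM.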
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, uint8_t ** pmem, size_t * psize) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(t->ne[0] % 8 == 0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    // Do in-place transformation. Allocate scratch buffer
+    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
+    if (size > *psize) {
+        uint8_t * new_mem = realloc(*pmem, size);
+        if (!new_mem) {
+            return -1;
+        }
+        *pmem = new_mem;
+        *psize = size;
+    }
+    block_q4_0x4 * dst = (block_q4_0x4 *) *pmem;
+    block_q4_0 * src = (block_q4_0 *) t->data;
+    block_q4_0 dst_tmp[4];
+    int n = t->ne[0];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
+        int cnt = 0;
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+        }
+        memcpy(src, dst, size);
+        src += cnt * 4;
+    }
+    return 0;
+}
+
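+// Same as repack_q4_0_to_q4_0_4_bl, but interleaves groups of 8 rows into the
+// block_q4_0x8 layout consumed by ggml_gemm_q4_0_8x8_q8_0 and friends.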
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, uint8_t ** pmem, size_t * psize) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(t->ne[0] % 8 == 0);
+    GGML_ASSERT(interleave_block == 8);
+
+    // Do in-place transformation. Allocate scratch buffer
+    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
+    if (size > *psize) {
+        uint8_t * new_mem = realloc(*pmem, size);
+        if (!new_mem) {
+            return -1;
+        }
+        *pmem = new_mem;
+        *psize = size;
+    }
+    block_q4_0x8 * dst = (block_q4_0x8 *) *pmem;
+    block_q4_0 * src = (block_q4_0 *) t->data;
+    block_q4_0 dst_tmp[8];
+    int n = t->ne[0];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_0;
+    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
+        int cnt = 0;
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+        }
+        memcpy(src, dst, size);
+        src += cnt * 8;
+    }
+    return 0;
+}
+
+// Prepare for optimized kernels if applicable
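+// On ARM builds, Q4_0 weight tensors are repacked in place and retagged to the
+// interleaved type that the detected CPU features can exploit, in order of
+// preference:
+//   SVE + i8mm with 256-bit vectors (sve_cnt == QK8_0) -> GGML_TYPE_Q4_0_8_8
+//   NEON + i8mm                                        -> GGML_TYPE_Q4_0_4_8
+//   NEON                                                -> GGML_TYPE_Q4_0_4_4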
+void ggml_prepare_optimal_kernel(struct ggml_tensor * cur, uint8_t ** pmem, size_t * psize) {
+    UNUSED(cur);
+    UNUSED(pmem);
+    UNUSED(psize);
+
+#if defined(__ARM_ARCH)
+    if (cur->type == GGML_TYPE_Q4_0) {
+        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_8_8;
+            }
+        }
+        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_4_8;
+            }
+        }
+        else if (ggml_cpu_has_neon()) {
+            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_4_4;
+            }
+        }
+    }
+#endif
+}
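For reference, a minimal caller sketch, not part of this diff: the wrapper function name, the iteration over a ggml_context, and the scratch-buffer ownership are assumptions about how ggml_prepare_optimal_kernel is meant to be driven (once per weight tensor after loading, reusing one scratch buffer and freeing it at the end).

#include <stdlib.h>
#include "ggml.h"

// Hypothetical call site: repack every tensor of a loaded model held in `ctx`.
void repack_all_weights(struct ggml_context * ctx) {
    uint8_t * scratch      = NULL;
    size_t    scratch_size = 0;

    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        // Repacks Q4_0 data in place and retags t->type when an optimized
        // kernel is available; otherwise leaves the tensor untouched.
        ggml_prepare_optimal_kernel(t, &scratch, &scratch_size);
    }

    free(scratch); // the scratch buffer only ever holds one interleaved row group
}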