@@ -3385,3 +3385,147 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
+// FIXME: this code is duplicated from ggml-aarch64.c
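+// interleave 4 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x4
+// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
+// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave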
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+    block_q4_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < QK4_0 * 2; i++) {
+        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
+        int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
+        src_offset += (i % blck_size_interleave);
+
+        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    }
+
+    return out;
+}
+
+// interleave 8 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x8
+// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
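+// e.g. with blck_size_interleave == 8: out.qs[0..7] come from in[0].qs[0..7], out.qs[8..15] from
+// in[1].qs[0..7], ..., out.qs[56..63] from in[7].qs[0..7], then out.qs[64..71] from in[0].qs[8..15], etc.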
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+    block_q4_0x8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < QK4_0 * 4; i++) {
+        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
+        int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
+        src_offset += (i % blck_size_interleave);
+
+        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    }
+
+    return out;
+}
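+
+// note: the repack paths below pass xor_mask == 0x88, which flips the high bit of each 4-bit nibble,
+// converting the offset-by-8 Q4_0 quants (0..15) into signed two's-complement nibbles (-8..7),
+// presumably so the interleaved gemv/gemm kernels can operate on signed values directly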
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
+    block_q4_0 dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
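+    // gather block x from each of nrows_interleaved consecutive source rows (a row holds nblocks
+    // block_q4_0s) and emit them as one interleaved block_q4_0x4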
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 8);
+
+    block_q4_0x8 * dst = (block_q4_0x8 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
+    block_q4_0 dst_tmp[8];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+// Prepare for optimized kernels if applicable
+void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
+    if (cur->type == repack_type) {
+        memcpy(cur->data, data, data_size);
+        return;
+    }
+
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+
+    switch (repack_type) {
+        case GGML_TYPE_Q4_0_8_8:
+            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+            break;
+        case GGML_TYPE_Q4_0_4_8:
+            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+            break;
+        case GGML_TYPE_Q4_0_4_4:
+            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+            break;
+        default:
+            GGML_ABORT("Unsupported type");
+    }
+}
+
+enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
+    if (cur->type == GGML_TYPE_Q4_0) {
+        // TODO: enable for AVX2 - currently disabled due to bad gemv performance
+        if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+            return GGML_TYPE_Q4_0_8_8;
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            return GGML_TYPE_Q4_0_4_8;
+        }
+        if (ggml_cpu_has_neon()) {
+            return GGML_TYPE_Q4_0_4_4;
+        }
+    }
+
+    return cur->type;
+}
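+
+// Usage sketch (illustration only, not part of this change): a loader that wants the optimized
+// layout would first query the best repack type for the tensor and then repack the raw Q4_0 data
+// into the tensor's buffer, e.g.
+//
+//     enum ggml_type repack_type = ggml_aarch64_get_optimal_repack_type(cur);
+//     ggml_aarch64_repack_tensor(cur, repack_type, data, data_size);
+//
+// where `cur` is the destination tensor and `data`/`data_size` describe the source Q4_0 blocks.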