@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
33873387}
33883388
33893389// FIXME: this code is duplicated from ggml-aarch64.c
3390- static block_q4_0x4 make_block_q4_0x4 (block_q4_0 * in , unsigned int blck_size_interleave , unsigned int xor_mask ) {
3390+ static block_q4_0x4 make_block_q4_0x4 (block_q4_0 * in , unsigned int blck_size_interleave ) {
33913391 block_q4_0x4 out ;
33923392
33933393 for (int i = 0 ; i < 4 ; i ++ ) {
33943394 out .d [i ] = in [i ].d ;
33953395 }
33963396
3397- for (int i = 0 ; i < QK4_0 * 2 ; i ++ ) {
3398- int src_offset = (i / (4 * blck_size_interleave )) * blck_size_interleave ;
3399- int src_id = (i % (4 * blck_size_interleave )) / blck_size_interleave ;
3400- src_offset += (i % blck_size_interleave );
3397+ const int end = QK4_0 * 2 / blck_size_interleave ;
34013398
3402- out .qs [i ] = in [src_id ].qs [src_offset ] ^ xor_mask ;
3399+ if (blck_size_interleave == 8 ) {
3400+ const uint64_t xor_mask = 0x8888888888888888ULL ;
3401+ for (int i = 0 ; i < end ; ++ i ) {
3402+ int src_id = i % 4 ;
3403+ int src_offset = (i / 4 ) * blck_size_interleave ;
3404+ int dst_offset = i * blck_size_interleave ;
3405+
3406+ uint64_t elems ;
3407+ // Using memcpy to avoid unaligned memory accesses
3408+ memcpy (& elems , & in [src_id ].qs [src_offset ], sizeof (uint64_t ));
3409+ elems ^= xor_mask ;
3410+ memcpy (& out .qs [dst_offset ], & elems , sizeof (uint64_t ));
3411+ }
3412+ } else if (blck_size_interleave == 4 ) {
3413+ const uint32_t xor_mask = 0x88888888 ;
3414+ for (int i = 0 ; i < end ; ++ i ) {
3415+ int src_id = i % 4 ;
3416+ int src_offset = (i / 4 ) * blck_size_interleave ;
3417+ int dst_offset = i * blck_size_interleave ;
3418+
3419+ uint32_t elems ;
3420+ memcpy (& elems , & in [src_id ].qs [src_offset ], sizeof (uint32_t ));
3421+ elems ^= xor_mask ;
3422+ memcpy (& out .qs [dst_offset ], & elems , sizeof (uint32_t ));
3423+ }
3424+ } else {
3425+ GGML_ASSERT (false);
34033426 }
34043427
34053428 return out ;
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
34093432// Returns an interleaved block_q4_0x8 built from 8 block_q4_0 blocks.
34103433// In the interleaved block_q4_0x8, the deltas of the 8 block_q4_0 blocks are placed
34113434// first; the quants of the 8 block_q4_0s are then interleaved in chunks of blck_size_interleave.
3412- static block_q4_0x8 make_block_q4_0x8 (block_q4_0 * in , unsigned int blck_size_interleave , unsigned int xor_mask ) {
3435+ static block_q4_0x8 make_block_q4_0x8 (block_q4_0 * in , unsigned int blck_size_interleave ) {
34133436 block_q4_0x8 out ;
34143437
34153438 for (int i = 0 ; i < 8 ; i ++ ) {
34163439 out .d [i ] = in [i ].d ;
34173440 }
34183441
3419- for (int i = 0 ; i < QK4_0 * 4 ; i ++ ) {
3420- int src_offset = (i / (8 * blck_size_interleave )) * blck_size_interleave ;
3421- int src_id = (i % (8 * blck_size_interleave )) / blck_size_interleave ;
3422- src_offset += (i % blck_size_interleave );
3442+ const int end = QK4_0 * 4 / blck_size_interleave ;
3443+ const uint64_t xor_mask = 0x8888888888888888ULL ;
3444+
3445+ for (int i = 0 ; i < end ; ++ i ) {
3446+ int src_id = i % 8 ;
3447+ int src_offset = (i / 8 ) * blck_size_interleave ;
3448+ int dst_offset = i * blck_size_interleave ;
34233449
3424- out .qs [i ] = in [src_id ].qs [src_offset ] ^ xor_mask ;
3450+ uint64_t elems ;
3451+ memcpy (& elems , & in [src_id ].qs [src_offset ], sizeof (uint64_t ));
3452+ elems ^= xor_mask ;
3453+ memcpy (& out .qs [dst_offset ], & elems , sizeof (uint64_t ));
34253454 }
34263455
34273456 return out ;
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
34493478 for (int i = 0 ; i < nrows_interleaved ; i ++ ) {
34503479 dst_tmp [i ] = src [x + i * nblocks ];
34513480 }
3452- * dst ++ = make_block_q4_0x4 (dst_tmp , interleave_block , 0x88 );
3481+ * dst ++ = make_block_q4_0x4 (dst_tmp , interleave_block );
34533482 }
34543483 src += nrows_interleaved * nblocks ;
34553484 }
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
34803509 for (int i = 0 ; i < nrows_interleaved ; i ++ ) {
34813510 dst_tmp [i ] = src [x + i * nblocks ];
34823511 }
3483- * dst ++ = make_block_q4_0x8 (dst_tmp , interleave_block , 0x88 );
3512+ * dst ++ = make_block_q4_0x8 (dst_tmp , interleave_block );
34843513 }
34853514 src += nrows_interleaved * nblocks ;
34863515 }
0 commit comments