Skip to content

Commit bd4b6cd

Browse files
committed
CUDA: optimize get_int_from_table_16
1 parent a094f38 commit bd4b6cd

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

ggml/src/ggml-cuda/vecdotq.cuh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,20 @@ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32
2929
}
3030

3131
static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, const int8_t * table) {
32+
#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
33+
uint32_t v1, v2, v3, v4, mask;
34+
const uint32_t * values = (const uint32_t *)table;
35+
36+
mask = (0x32103210 | ((q4 & 0x88888888) >> 1));
37+
v1 = __byte_perm(values[0], values[1], q4);
38+
v2 = __byte_perm(values[2], values[3], q4);
39+
v3 = __byte_perm(v1, v2, mask);
40+
v1 = __byte_perm(values[0], values[1], q4 >> 16);
41+
v2 = __byte_perm(values[2], values[3], q4 >> 16);
42+
v4 = __byte_perm(v1, v2, mask >> 16);
43+
44+
return make_int2(__byte_perm(v3, v4, 0x6420), __byte_perm(v3, v4, 0x7531));
45+
#else
3246
const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
3347
const int8_t * q0_8 = (const int8_t *) &q0_32;
3448
const char4 val0_8 = make_char4(
@@ -40,6 +54,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
4054
table[q1_8[0]], table[q1_8[1]], table[q1_8[2]], table[q1_8[3]]);
4155

4256
return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
57+
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
4358
}
4459

4560
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called

0 commit comments

Comments
 (0)