@@ -29,6 +29,20 @@ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32
2929}
3030
3131static __device__ __forceinline__ int2 get_int_from_table_16 (const int & q4, const int8_t * table) {
32+ #if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
33+ uint32_t v1, v2, v3, v4, mask;
34+ const uint32_t * values = (const uint32_t *)table;
35+
36+ mask = (0x32103210 | ((q4 & 0x88888888 ) >> 1 ));
37+ v1 = __byte_perm (values[0 ], values[1 ], q4);
38+ v2 = __byte_perm (values[2 ], values[3 ], q4);
39+ v3 = __byte_perm (v1, v2, mask);
40+ v1 = __byte_perm (values[0 ], values[1 ], q4 >> 16 );
41+ v2 = __byte_perm (values[2 ], values[3 ], q4 >> 16 );
42+ v4 = __byte_perm (v1, v2, mask >> 16 );
43+
44+ return make_int2 (__byte_perm (v3, v4, 0x6420 ), __byte_perm (v3, v4, 0x7531 ));
45+ #else
3246 const int q0_32 = (q4 >> 0 ) & 0x0F0F0F0F ;
3347 const int8_t * q0_8 = (const int8_t *) &q0_32;
3448 const char4 val0_8 = make_char4 (
@@ -40,6 +54,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
4054 table[q1_8[0 ]], table[q1_8[1 ]], table[q1_8[2 ]], table[q1_8[3 ]]);
4155
4256 return make_int2 (*((const int *) &val0_8), *((const int *) &val1_8));
57+ #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
4358}
4459
4560// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
0 commit comments