CUDA: optimize get_int_from_table_16

Qweewee · Qweewee · commit bd4b6cd23705 · 2025-08-20T17:51:08.000+08:00
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -29,6 +29,20 @@ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32
 }
 
 static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, const int8_t * table) {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+    uint32_t v1, v2, v3, v4, mask;
+    const uint32_t * values = (const uint32_t *)table;
+
+    mask = (0x32103210 | ((q4 & 0x88888888) >> 1));
+    v1 = __byte_perm(values[0], values[1], q4);
+    v2 = __byte_perm(values[2], values[3], q4);
+    v3 = __byte_perm(v1, v2, mask);
+    v1 = __byte_perm(values[0], values[1], q4 >> 16);
+    v2 = __byte_perm(values[2], values[3], q4 >> 16);
+    v4 = __byte_perm(v1, v2, mask >> 16);
+
+    return make_int2(__byte_perm(v3, v4, 0x6420), __byte_perm(v3, v4, 0x7531));
+#else
     const int      q0_32  = (q4 >> 0) & 0x0F0F0F0F;
     const int8_t * q0_8   = (const int8_t *) &q0_32;
     const char4    val0_8 = make_char4(
@@ -40,6 +54,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
         table[q1_8[0]], table[q1_8[1]], table[q1_8[2]], table[q1_8[3]]);
 
     return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 }
 
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called