@@ -369,7 +369,7 @@ int __device__ __forceinline__ trellis_next_int(uint32_t& val) {
 
 float __device__ __forceinline__ trellis_next(uint32_t& val) {
     constexpr uint32_t ka = 3417055213;
-    constexpr uint32_t kb = 0;
+    //constexpr uint32_t kb = 0;
     constexpr uint32_t kmask = 0x8fff8fff;
     constexpr uint32_t km32 = 0x3b603b60;
     uint32_t s;
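The hunk above shows only the head of `trellis_next`; the rest of the body is not part of the diff. For context, here is a plausible reconstruction of the full generator from the visible constants (an assumption, not the committed code): the state is advanced by an LCG multiply (the add disappears because `kb == 0`, which is why its declaration is commented out), and the masked state is reinterpreted as two packed fp16 values whose sum is the sample.

```cuda
// Plausible sketch of the full generator (assumed; only the lines above appear
// in the diff). trellis_next_sketch is a hypothetical name.
float __device__ __forceinline__ trellis_next_sketch(uint32_t& val) {
    constexpr uint32_t ka    = 3417055213;  // LCG multiplier from the hunk above
    constexpr uint32_t kmask = 0x8fff8fff;  // keep sign + low mantissa bits of each 16-bit half
    constexpr uint32_t km32  = 0x3b603b60;  // force both exponent fields to a fixed small value
    uint32_t s;
    const half * h = (const half *)&s;
    val *= ka;                              // kb == 0, so no "+ kb" is needed
    s = (val & kmask) ^ km32;
    return (float)h[0] + (float)h[1];       // two small halves summed -> next sample
}
```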
@@ -1056,6 +1056,48 @@ static __global__ void dequantize_block_iq3_ks_v1(const void * __restrict__ vx,
     }
 }
 
+template <typename dst_t>
+static __global__ void dequantize_block_iq2_kl(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
+
+    int64_t ii  = blockIdx.x;
+    int64_t row = (QK_K * ii) / n_per_row;                    // tensor row this super-block lives in
+    const char * cx = (const char *)vx + row * row_size;
+    float scale = (float)*(const ggml_half *)cx;              // per-row fp16 scale stored at the start of the row
+    const block_iq2_kl * x = (const block_iq2_kl *)(cx + sizeof(ggml_half));
+    const int64_t i = ii - (row*n_per_row)/QK_K;              // super-block index within the row
+
+    const int64_t tid  = threadIdx.x;
+    const int64_t ib64 = tid/8;                               // 64-element chunk (0..3)
+    const int64_t il   = tid%8;                               // position within the chunk
+    dst_t * y = yy + ii*QK_K + 64*ib64 + 4*il;
+    const uint8_t * qs = x[i].qs + 16*ib64 + 2*il;
+    const uint8_t * qh = x[i].qh + 2*il;
+    auto sh = x[i].scales_h >> 4*ib64;
+    // each chunk carries two 6-bit scales: 4 low bits from scales_l, 2 high bits from scales_h
+    const float d1 = scale * (int(((x[i].scales_l[(2*ib64+0)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 4) & 0x30)) - 32);
+    const float d2 = scale * (int(((x[i].scales_l[(2*ib64+1)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 2) & 0x30)) - 32);
+    if constexpr (std::is_same_v<dst_t, nv_bfloat16>) {
+        for (int j = 0; j < 2; ++j) {
+            uint8_t h = qh[j] >> 2*ib64;
+            auto val1 = (const int8_t *)(iq2kl_values + ((qs[j] & 0xf) | ((h & 1) << 4)));
+            auto val2 = (const int8_t *)(iq2kl_values + ((qs[j] >>  4) | ((h & 2) << 3)));
+            y[2*j+ 0] = __float2bfloat16(d1 * val1[0]);
+            y[2*j+ 1] = __float2bfloat16(d1 * val1[1]);
+            y[2*j+32] = __float2bfloat16(d2 * val2[0]);
+            y[2*j+33] = __float2bfloat16(d2 * val2[1]);
+        }
+    } else {
+        for (int j = 0; j < 2; ++j) {
+            uint8_t h = qh[j] >> 2*ib64;
+            auto val1 = (const int8_t *)(iq2kl_values + ((qs[j] & 0xf) | ((h & 1) << 4)));
+            auto val2 = (const int8_t *)(iq2kl_values + ((qs[j] >>  4) | ((h & 2) << 3)));
+            y[2*j+ 0] = d1 * val1[0];
+            y[2*j+ 1] = d1 * val1[1];
+            y[2*j+32] = d2 * val2[0];
+            y[2*j+33] = d2 * val2[1];
+        }
+    }
+}
+
 template <typename dst_t>
 static __global__ void dequantize_block_iq3_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
 
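The new kernel packs a lot of indexing into a few lines. As a cross-check, a scalar reference for one 256-element super-block (a hypothetical helper, not part of the patch) can mirror the same layout: four 64-element chunks per block, two 6-bit scales per chunk (4-bit low nibble from `scales_l`, 2-bit high field from `scales_h`, biased by -32), and a 5-bit codebook index per packed nibble (4 bits from `qs` plus 1 bit from `qh`), where each `iq2kl_values` entry packs a pair of int8 values, as the `(const int8_t *)` casts in the kernel imply.

```cpp
// Hypothetical scalar reference for one IQ2_KL super-block (QK_K == 256),
// mirroring the indexing of dequantize_block_iq2_kl above.
static void dequantize_iq2_kl_block_ref(float scale, const block_iq2_kl * x, float * y) {
    for (int ib64 = 0; ib64 < 4; ++ib64) {  // four 64-element chunks
        int sh = x->scales_h >> 4*ib64;     // 4 high-scale bits for this chunk
        float d1 = scale * (int(((x->scales_l[(2*ib64+0)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 4) & 0x30)) - 32);
        float d2 = scale * (int(((x->scales_l[(2*ib64+1)%4] >> 4*(ib64/2)) & 0xf) | ((sh << 2) & 0x30)) - 32);
        for (int j = 0; j < 16; ++j) {      // 16 qs bytes -> 2 pairs of outputs each
            uint8_t h = x->qh[j] >> 2*ib64;
            const int8_t * v1 = (const int8_t *)(iq2kl_values + ((x->qs[16*ib64 + j] & 0xf) | ((h & 1) << 4)));
            const int8_t * v2 = (const int8_t *)(iq2kl_values + ((x->qs[16*ib64 + j] >>  4) | ((h & 2) << 3)));
            y[64*ib64 + 2*j +  0] = d1 * v1[0];  // low nibble -> first 32 values of the chunk
            y[64*ib64 + 2*j +  1] = d1 * v1[1];
            y[64*ib64 + 2*j + 32] = d2 * v2[0];  // high nibble -> second 32 values
            y[64*ib64 + 2*j + 33] = d2 * v2[1];
        }
    }
}
```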
@@ -1668,6 +1710,14 @@ static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_iq2_k<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template <typename dst_t>
+static void dequantize_row_iq2_kl_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+    const int64_t k = nrows * n_per_row;
+    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row);
+    const int nb = (k + QK_K - 1) / QK_K;    // one CUDA block per 256-element super-block
+    dequantize_block_iq2_kl<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
+}
+
 template <typename dst_t>
 static void dequantize_row_iq3_ks_v1_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
     const int64_t k = nrows * n_per_row;
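A minimal host-side sketch of how the new launcher might be called (buffer names and sizes are illustrative, not from the patch): each CUDA block of 32 threads dequantizes one QK_K = 256 super-block, so `nb` rounds the total element count up to a multiple of QK_K, and `row_size` tells the kernel where each row's leading `ggml_half` scale sits.

```cpp
// Illustrative only: dequantize an IQ2_KL tensor of nrows x n_per_row to fp32.
const int64_t nrows = 8, n_per_row = 4096;           // assumed sizes (n_per_row % QK_K == 0)
float * d_dst = nullptr;
cudaMalloc(&d_dst, nrows*n_per_row*sizeof(float));   // device output buffer
// d_src holds nrows * ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row) bytes of quantized data
dequantize_row_iq2_kl_cuda(d_src, d_dst, nrows, n_per_row, stream);
```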
@@ -1869,6 +1919,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
             return dequantize_row_iq2_ks_cuda<nv_bfloat16>;
         case GGML_TYPE_IQ2_K:
             return dequantize_row_iq2_k_cuda<nv_bfloat16>;
+        case GGML_TYPE_IQ2_KL:
+            return dequantize_row_iq2_kl_cuda<nv_bfloat16>;
         case GGML_TYPE_IQ3_KS_V1:
             return dequantize_row_iq3_ks_v1_cuda<nv_bfloat16>;
         case GGML_TYPE_IQ3_KS:
@@ -1969,10 +2021,12 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ2_KS:
             return dequantize_row_iq2_ks_cuda;
-        case GGML_TYPE_IQ3_KS_V1:
-            return dequantize_row_iq3_ks_v1_cuda;
         case GGML_TYPE_IQ2_K:
             return dequantize_row_iq2_k_cuda;
+        case GGML_TYPE_IQ2_KL:
+            return dequantize_row_iq2_kl_cuda;
+        case GGML_TYPE_IQ3_KS_V1:
+            return dequantize_row_iq3_ks_v1_cuda;
         case GGML_TYPE_IQ3_KS:
             return dequantize_row_iq3_ks_cuda;
         case GGML_TYPE_IQ3_K:
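All three dispatch tables are consumed the same way; a minimal usage sketch, assuming the function-pointer signature matches the launchers above and that `d_src`/`d_dst_f16` are device buffers:

```cpp
// Assumed usage: fetch the fp16 converter for IQ2_KL and run it.
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(GGML_TYPE_IQ2_KL);
if (to_fp16 != nullptr) {
    to_fp16(d_src, d_dst_f16, nrows, n_per_row, stream);
}
```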
@@ -2081,6 +2135,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq2_k_cuda;
         case GGML_TYPE_IQ3_K:
             return dequantize_row_iq3_k_cuda;
+        case GGML_TYPE_IQ2_KL:
+            return dequantize_row_iq2_kl_cuda;
         case GGML_TYPE_IQ3_KS:
             return dequantize_row_iq3_ks_cuda;
         case GGML_TYPE_IQ4_K: