33#include " dequantize.hpp"
44#include " presets.hpp"
55
6-
76static void convert_f16 (const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
87 const sycl::half *x = (const sycl::half *)vx;
98
@@ -91,7 +90,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
9190 }
9291}
9392
94- template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder >
93+ template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder >
9594static void dequantize_mul_mat_vec_reorder (const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
9695 const sycl::nd_item<3 > &item_ct1) {
9796 // qk = quantized weights per x block
@@ -134,7 +133,7 @@ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const d
134133 // dequantize
135134 // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
136135 dfloat2 v;
137- dequantize_kernel_recorder ((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
136+ dequantize_kernel_reorder ((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
138137
139138 // matrix multiplication
140139 // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
@@ -165,7 +164,7 @@ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const d
165164 // dequantize
166165 // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
167166 dfloat2 v;
168- dequantize_kernel_recorder ((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
167+ dequantize_kernel_reorder ((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
169168
170169 // matrix multiplication
171170 // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
@@ -865,7 +864,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
865864 }
866865}
867866
868-
869867static void dequantize_mul_mat_vec_q4_0_sycl_reorder (const void *vx, const dfloat *y,
870868 float *dst, const int ncols,
871869 const int nrows,
@@ -1082,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10821080
10831081 const int64_t ne00 = src0->ne [0 ];
10841082 const int64_t row_diff = row_high - row_low;
1085-
10861083 GGML_ASSERT (src1->type == GGML_TYPE_F32);
10871084 // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
10881085#ifdef GGML_SYCL_F16
@@ -1096,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10961093
10971094 if (src1_convert_f16) {
10981095 src1_dfloat = src1_dfloat_a.alloc (ne00);
1099- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl (src1->type , ctx );
1096+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl (src1->type , dst );
11001097 GGML_ASSERT (to_fp16_sycl != nullptr );
11011098 to_fp16_sycl (src1_ddf_i, src1_dfloat, ne00, stream);
11021099 }
@@ -1106,7 +1103,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
11061103
11071104 switch (src0->type ) {
11081105 case GGML_TYPE_Q4_0:
1109- if (ctx.opt_feature .reorder ) {
1106+ if ((ggml_tensor_extra_gpu*)dst->src [0 ]->extra &&
1107+ ((ggml_tensor_extra_gpu*)dst->src [0 ]->extra )->optimized_feature .reorder ) {
11101108 dequantize_mul_mat_vec_q4_0_sycl_reorder (src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
11111109 } else {
11121110 dequantize_mul_mat_vec_q4_0_sycl (src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
0 commit comments