1414#define  GGML_SYCL_VECDOTQ_HPP 
1515
1616#include  " dpct/helper.hpp" 
17+ #include  " syclcompat/math.hpp" 
1718
1819typedef  float  (*vec_dot_q_sycl_t )(const  void  * __restrict__ vbq, const  block_q8_1 * __restrict__ bq8_1, const  int  & iqs);
1920
@@ -89,14 +90,14 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
8990        const  int  vi = (v >> (2 *i)) & 0x03030303 ;
9091
9192        sumf_d +=
92-             d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); //  SIMD dot product
93+             d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); //  SIMD dot product
9394
9495        //  fill int with 4x m
9596        int  m = sc >> 4 ;
9697        m |= m <<  8 ;
9798        m |= m << 16 ;
9899        sumf_m += d8[i] *
99-                   dpct ::dp4a (
100+                   syclcompat ::dp4a (
100101                      m, u[i],
101102                      0 ); //  multiply constant q2_K part with sum of q8_1 values
102103    }
@@ -139,7 +140,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
139140        const  int  vi =
140141            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat ());
141142
142-         sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
143+         sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
143144    }
144145
145146    return  d3 * sumf;
@@ -162,11 +163,11 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
162163        const  int  v1i = (v[1 ] >> (4 *i)) & 0x0F0F0F0F ;
163164
164165        const  int  dot1 =
165-             dpct ::dp4a (v1i, u[2  * i + 1 ],
166-                        dpct ::dp4a (v0i, u[2  * i + 0 ], 0 )); //  SIMD dot product
166+             syclcompat ::dp4a (v1i, u[2  * i + 1 ],
167+                        syclcompat ::dp4a (v0i, u[2  * i + 0 ], 0 )); //  SIMD dot product
167168        const  int  dot2 =
168-             dpct ::dp4a (0x01010101 , u[2  * i + 1 ],
169-                        dpct ::dp4a (0x01010101 , u[2  * i + 0 ], 0 )); //  sum of u
169+             syclcompat ::dp4a (0x01010101 , u[2  * i + 1 ],
170+                        syclcompat ::dp4a (0x01010101 , u[2  * i + 0 ], 0 )); //  sum of u
170171
171172        sumf_d += d8[i] * (dot1 * sc[i]);
172173        sumf_m += d8[i] * (dot2 * m[i]);  //  multiply constant part of q4_K with sum of q8_1 values
@@ -203,11 +204,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
203204        const  int  v1i = vl1i | vh1i;
204205
205206        const  int  dot1 =
206-             dpct ::dp4a (v0i, u[2  * i + 0 ],
207-                        dpct ::dp4a (v1i, u[2  * i + 1 ], 0 )); //  SIMD dot product
207+             syclcompat ::dp4a (v0i, u[2  * i + 0 ],
208+                        syclcompat ::dp4a (v1i, u[2  * i + 1 ], 0 )); //  SIMD dot product
208209        const  int  dot2 =
209-             dpct ::dp4a (0x01010101 , u[2  * i + 0 ],
210-                        dpct ::dp4a (0x01010101 , u[2  * i + 1 ], 0 )); //  sum of u
210+             syclcompat ::dp4a (0x01010101 , u[2  * i + 0 ],
211+                        syclcompat ::dp4a (0x01010101 , u[2  * i + 1 ], 0 )); //  sum of u
211212
212213        sumf_d += d8[i] * (dot1 * sc[i]);
213214        sumf_m += d8[i] * (dot2 * m[i]);
@@ -243,7 +244,7 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
243244        const  int  vi = dpct::vectorized_binary<sycl::char4>(
244245            (vil | vih), 0x20202020 , dpct::sub_sat ()); //  vi = (vil | vih) - 32
245246
246-         sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
247+         sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
247248    }
248249
249250    return  d*sumf;
@@ -266,8 +267,8 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
266267        const  int  vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
267268
268269        //  SIMD dot product of quantized values
269-         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ], sumi);
270-         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ], sumi);
270+         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ], sumi);
271+         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ], sumi);
271272    }
272273
273274    const  sycl::float2 ds8f =
@@ -293,8 +294,8 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
293294        const  int  vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
294295
295296        //  SIMD dot product of quantized values
296-         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ], sumi);
297-         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ], sumi);
297+         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ], sumi);
298+         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ], sumi);
298299    }
299300
300301#ifdef  GGML_SYCL_F16
@@ -331,15 +332,15 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
331332        vi0    |= (vh[i] << 11 ) & 0x00001000 ; //  1 -> 12
332333        vi0    |= (vh[i] << 18 ) & 0x00100000 ; //  2 -> 20
333334        vi0    |= (vh[i] << 25 ) & 0x10000000 ; //  3 -> 28
334-         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ],
335+         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ],
335336                          sumi); //  SIMD dot product of quantized values
336337
337338        int  vi1 = (vl[i] >>  4 ) & 0x0F0F0F0F ; //  upper 4 qs bits, still need qh as 5th bits
338339        vi1    |= (vh[i] >> 12 ) & 0x00000010 ; //  16 ->  4
339340        vi1    |= (vh[i] >>  5 ) & 0x00001000 ; //  17 -> 12
340341        vi1    |= (vh[i] <<  2 ) & 0x00100000 ; //  18 -> 20
341342        vi1    |= (vh[i] <<  9 ) & 0x10000000 ; //  19 -> 28
342-         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ],
343+         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ],
343344                          sumi); //  SIMD dot product of quantized values
344345    }
345346
@@ -367,15 +368,15 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
367368        vi0    |= (vh[i] << 11 ) & 0x00001000 ; //  1 -> 12
368369        vi0    |= (vh[i] << 18 ) & 0x00100000 ; //  2 -> 20
369370        vi0    |= (vh[i] << 25 ) & 0x10000000 ; //  3 -> 28
370-         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ],
371+         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ],
371372                          sumi); //  SIMD dot product of quantized values
372373
373374        int  vi1 = (vl[i] >>  4 ) & 0x0F0F0F0F ; //  upper 4 qs bits, still need qh as 5th bits
374375        vi1    |= (vh[i] >> 12 ) & 0x00000010 ; //  16 ->  4
375376        vi1    |= (vh[i] >>  5 ) & 0x00001000 ; //  17 -> 12
376377        vi1    |= (vh[i] <<  2 ) & 0x00100000 ; //  18 -> 20
377378        vi1    |= (vh[i] <<  9 ) & 0x10000000 ; //  19 -> 28
378-         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ],
379+         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ],
379380                          sumi); //  SIMD dot product of quantized values
380381    }
381382
@@ -412,7 +413,7 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
412413#pragma  unroll
413414    for  (int  i = 0 ; i < vdr; ++i) {
414415        //  SIMD dot product of quantized values
415-         sumi = dpct ::dp4a (v[i], u[i], sumi);
416+         sumi = syclcompat ::dp4a (v[i], u[i], sumi);
416417    }
417418
418419    return  d8_0*d8_1 * sumi;
@@ -428,7 +429,7 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
428429#pragma  unroll
429430    for  (int  i = 0 ; i < vdr; ++i) {
430431        //  SIMD dot product of quantized values
431-         sumi = dpct ::dp4a (v[i], u[i], sumi);
432+         sumi = syclcompat ::dp4a (v[i], u[i], sumi);
432433    }
433434
434435#ifdef  GGML_SYCL_F16
@@ -677,10 +678,10 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
677678    const  int  v1 = q4[0 ];
678679    const  int  v2 = q4[4 ];
679680
680-     const  int  dot1 = dpct ::dp4a (ui2, v2 & 0x0f0f0f0f , dpct ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
681-     const  int  dot2 = dpct ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , dpct ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
682-     const  int  dot3 = dpct ::dp4a (0x01010101 , ui2, dpct ::dp4a (0x01010101 , ui1, 0 ));
683-     const  int  dot4 = dpct ::dp4a (0x01010101 , ui4, dpct ::dp4a (0x01010101 , ui3, 0 ));
681+     const  int  dot1 = syclcompat ::dp4a (ui2, v2 & 0x0f0f0f0f , syclcompat ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
682+     const  int  dot2 = syclcompat ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , syclcompat ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
683+     const  int  dot3 = syclcompat ::dp4a (0x01010101 , ui2, syclcompat ::dp4a (0x01010101 , ui1, 0 ));
684+     const  int  dot4 = syclcompat ::dp4a (0x01010101 , ui4, syclcompat ::dp4a (0x01010101 , ui3, 0 ));
684685
685686    sumf_d += d8_1 * (dot1 * s[0 ]) + d8_2 * (dot2 * s[1 ]);
686687    sumf_m += d8_1 * (dot3 * s[2 ]) + d8_2 * (dot4 * s[3 ]);
@@ -772,8 +773,8 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
772773    const  int  v3 = (((vh >> 0 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl1 >> 4 ) & 0x0f0f0f0f );
773774    const  int  v4 = (((vh >> 2 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl2 >> 4 ) & 0x0f0f0f0f );
774775
775-     const  float  sumf_d = d8_1 * (dpct ::dp4a (ui1, v1, 0 ) * s[0 ] + dpct ::dp4a (ui2, v2, 0 ) * s[1 ])
776-                        + d8_2 * (dpct ::dp4a (ui3, v3, 0 ) * s[2 ] + dpct ::dp4a (ui4, v4, 0 ) * s[3 ]);
776+     const  float  sumf_d = d8_1 * (syclcompat ::dp4a (ui1, v1, 0 ) * s[0 ] + syclcompat ::dp4a (ui2, v2, 0 ) * s[1 ])
777+                        + d8_2 * (syclcompat ::dp4a (ui3, v3, 0 ) * s[2 ] + syclcompat ::dp4a (ui4, v4, 0 ) * s[3 ]);
777778
778779    return  d * sumf_d;
779780
@@ -865,8 +866,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
865866            grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
866867        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
867868            grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
868-         sumi1 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
869-         sumi1 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
869+         sumi1 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
870+         sumi1 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
870871        q8 += 8 ;
871872    }
872873    int  sumi2 = 0 ;
@@ -877,8 +878,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
877878            grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
878879        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
879880            grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
880-         sumi2 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
881-         sumi2 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
881+         sumi2 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
882+         sumi2 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
882883        q8 += 8 ;
883884    }
884885    const  float  d = (float )bq2->d  * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -917,8 +918,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
917918            grid[0 ] ^ signs0, signs0, std::minus<>());
918919        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
919920            grid[1 ] ^ signs1, signs1, std::minus<>());
920-         sumi1 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
921-         sumi1 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
921+         sumi1 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
922+         sumi1 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
922923        q8 += 8 ;
923924    }
924925    int  sumi2 = 0 ;
@@ -934,8 +935,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
934935            grid[0 ] ^ signs0, signs0, std::minus<>());
935936        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
936937            grid[1 ] ^ signs1, signs1, std::minus<>());
937-         sumi2 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
938-         sumi2 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
938+         sumi2 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
939+         sumi2 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
939940        q8 += 8 ;
940941    }
941942    const  float  d = (float )bq2->d  * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -968,8 +969,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
968969            grid1[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
969970        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
970971            grid2[0 ] ^ signs[1 ], signs[1 ], std::minus<>());
971-         sumi = dpct ::dp4a (grid_l, *((int  *)q8 + 0 ), sumi);
972-         sumi = dpct ::dp4a (grid_h, *((int  *)q8 + 1 ), sumi);
972+         sumi = syclcompat ::dp4a (grid_l, *((const   int  *)q8 + 0 ), sumi);
973+         sumi = syclcompat ::dp4a (grid_h, *((const   int  *)q8 + 1 ), sumi);
973974        q8 += 8 ;
974975        aux32 >>= 7 ;
975976    }
@@ -1009,8 +1010,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
10091010            grid1[0 ] ^ signs0, signs0, std::minus<>());
10101011        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
10111012            grid2[0 ] ^ signs1, signs1, std::minus<>());
1012-         sumi = dpct ::dp4a (grid_l, *((int  *)q8 + 0 ), sumi);
1013-         sumi = dpct ::dp4a (grid_h, *((int  *)q8 + 1 ), sumi);
1013+         sumi = syclcompat ::dp4a (grid_l, *((const   int  *)q8 + 0 ), sumi);
1014+         sumi = syclcompat ::dp4a (grid_h, *((const   int  *)q8 + 1 ), sumi);
10141015        q8 += 8 ;
10151016    }
10161017    const  float  d =
@@ -1037,8 +1038,8 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
10371038        const  int  * grid = (const  int  *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [ib32] >> 3 *l) & 7 ) << 8 )));
10381039        int  grid0 = grid[0 ] & 0x0f0f0f0f ;
10391040        int  grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1040-         sumi = dpct ::dp4a (q8[2  * l + 1 ], grid1,
1041-                           dpct ::dp4a (q8[2  * l + 0 ], grid0, sumi));
1041+         sumi = syclcompat ::dp4a (q8[2  * l + 1 ], grid1,
1042+                           syclcompat ::dp4a (q8[2  * l + 0 ], grid0, sumi));
10421043    }
10431044
10441045    const  float  delta = bq1->qh [ib32] & 0x8000  ? -1 -IQ1S_DELTA : -1 +IQ1S_DELTA;
@@ -1066,11 +1067,11 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
10661067        const  int  * grid = (const  int  *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 7 ) << 8 )));
10671068        int  grid0 = grid[0 ] & 0x0f0f0f0f ;
10681069        int  grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1069-         sumi[l / 2 ] = dpct ::dp4a (q8[2  * l + 1 ], grid1,
1070-                                  dpct ::dp4a (q8[2  * l + 0 ], grid0, sumi[l / 2 ]));
1070+         sumi[l / 2 ] = syclcompat ::dp4a (q8[2  * l + 1 ], grid1,
1071+                                  syclcompat ::dp4a (q8[2  * l + 0 ], grid0, sumi[l / 2 ]));
10711072        const  float  delta = (bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 0x08  ? -1 -IQ1M_DELTA : -1 +IQ1M_DELTA;
1072-         const  int  sumy = dpct ::dp4a (q8[2  * l + 1 ], 0x01010101 ,
1073-                                     dpct ::dp4a (q8[2  * l + 0 ], 0x01010101 , 0 ));
1073+         const  int  sumy = syclcompat ::dp4a (q8[2  * l + 1 ], 0x01010101 ,
1074+                                     syclcompat ::dp4a (q8[2  * l + 0 ], 0x01010101 , 0 ));
10741075        sumf[l/2 ] += delta*sumy;
10751076    }
10761077
@@ -1101,8 +1102,8 @@ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
11011102    for  (int  l = 0 ; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
11021103        const  uint32_t  aux = q4[2 *l] | (q4[2 *l+1 ] << 16 );
11031104        get_int_from_table_16 (aux, values, v1, v2);
1104-         sumi1 = dpct ::dp4a (v1, q8[l + 0 ], sumi1);
1105-         sumi2 = dpct ::dp4a (v2, q8[l + 4 ], sumi2);
1105+         sumi1 = syclcompat ::dp4a (v1, q8[l + 0 ], sumi1);
1106+         sumi2 = syclcompat ::dp4a (v2, q8[l + 4 ], sumi2);
11061107    }
11071108
11081109    const  float  d = (float )bq->d  * bq8_1->ds [0 ];
@@ -1128,8 +1129,8 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
11281129    int  sumi1 = 0 , sumi2 = 0 ;
11291130    for  (int  j = 0 ; j < 4 ; ++j) {
11301131        get_int_from_table_16 (q4[j], values, v1, v2);
1131-         sumi1 = dpct ::dp4a (v1, q8[j + 0 ], sumi1);
1132-         sumi2 = dpct ::dp4a (v2, q8[j + 4 ], sumi2);
1132+         sumi1 = syclcompat ::dp4a (v1, q8[j + 0 ], sumi1);
1133+         sumi2 = syclcompat ::dp4a (v2, q8[j + 4 ], sumi2);
11331134    }
11341135    return  d * (sumi1 + sumi2);
11351136#else 
0 commit comments