1414#define  GGML_SYCL_VECDOTQ_HPP 
1515
1616#include  " dpct/helper.hpp" 
17- #include  " syclcompat/math.hpp" 
1817
1918typedef  float  (*vec_dot_q_sycl_t )(const  void  * __restrict__ vbq, const  block_q8_1 * __restrict__ bq8_1, const  int  & iqs);
2019
@@ -90,14 +89,14 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
9089        const  int  vi = (v >> (2 *i)) & 0x03030303 ;
9190
9291        sumf_d +=
93-             d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); //  SIMD dot product
92+             d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); //  SIMD dot product
9493
9594        //  fill int with 4x m
9695        int  m = sc >> 4 ;
9796        m |= m <<  8 ;
9897        m |= m << 16 ;
9998        sumf_m += d8[i] *
100-                   syclcompat ::dp4a (
99+                   dpct ::dp4a (
101100                      m, u[i],
102101                      0 ); //  multiply constant q2_K part with sum of q8_1 values
103102    }
@@ -140,7 +139,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
140139        const  int  vi =
141140            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat ());
142141
143-         sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
142+         sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
144143    }
145144
146145    return  d3 * sumf;
@@ -163,11 +162,11 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
163162        const  int  v1i = (v[1 ] >> (4 *i)) & 0x0F0F0F0F ;
164163
165164        const  int  dot1 =
166-             syclcompat ::dp4a (v1i, u[2  * i + 1 ],
167-                        syclcompat ::dp4a (v0i, u[2  * i + 0 ], 0 )); //  SIMD dot product
165+             dpct ::dp4a (v1i, u[2  * i + 1 ],
166+                        dpct ::dp4a (v0i, u[2  * i + 0 ], 0 )); //  SIMD dot product
168167        const  int  dot2 =
169-             syclcompat ::dp4a (0x01010101 , u[2  * i + 1 ],
170-                        syclcompat ::dp4a (0x01010101 , u[2  * i + 0 ], 0 )); //  sum of u
168+             dpct ::dp4a (0x01010101 , u[2  * i + 1 ],
169+                        dpct ::dp4a (0x01010101 , u[2  * i + 0 ], 0 )); //  sum of u
171170
172171        sumf_d += d8[i] * (dot1 * sc[i]);
173172        sumf_m += d8[i] * (dot2 * m[i]);  //  multiply constant part of q4_K with sum of q8_1 values
@@ -204,11 +203,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
204203        const  int  v1i = vl1i | vh1i;
205204
206205        const  int  dot1 =
207-             syclcompat ::dp4a (v0i, u[2  * i + 0 ],
208-                        syclcompat ::dp4a (v1i, u[2  * i + 1 ], 0 )); //  SIMD dot product
206+             dpct ::dp4a (v0i, u[2  * i + 0 ],
207+                        dpct ::dp4a (v1i, u[2  * i + 1 ], 0 )); //  SIMD dot product
209208        const  int  dot2 =
210-             syclcompat ::dp4a (0x01010101 , u[2  * i + 0 ],
211-                        syclcompat ::dp4a (0x01010101 , u[2  * i + 1 ], 0 )); //  sum of u
209+             dpct ::dp4a (0x01010101 , u[2  * i + 0 ],
210+                        dpct ::dp4a (0x01010101 , u[2  * i + 1 ], 0 )); //  sum of u
212211
213212        sumf_d += d8[i] * (dot1 * sc[i]);
214213        sumf_m += d8[i] * (dot2 * m[i]);
@@ -244,7 +243,7 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
244243        const  int  vi = dpct::vectorized_binary<sycl::char4>(
245244            (vil | vih), 0x20202020 , dpct::sub_sat ()); //  vi = (vil | vih) - 32
246245
247-         sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
246+         sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); //  SIMD dot product
248247    }
249248
250249    return  d*sumf;
@@ -267,8 +266,8 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
267266        const  int  vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
268267
269268        //  SIMD dot product of quantized values
270-         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ], sumi);
271-         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ], sumi);
269+         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ], sumi);
270+         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ], sumi);
272271    }
273272
274273    const  sycl::float2 ds8f =
@@ -294,8 +293,8 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
294293        const  int  vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
295294
296295        //  SIMD dot product of quantized values
297-         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ], sumi);
298-         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ], sumi);
296+         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ], sumi);
297+         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ], sumi);
299298    }
300299
301300#ifdef  GGML_SYCL_F16
@@ -332,15 +331,15 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
332331        vi0    |= (vh[i] << 11 ) & 0x00001000 ; //  1 -> 12
333332        vi0    |= (vh[i] << 18 ) & 0x00100000 ; //  2 -> 20
334333        vi0    |= (vh[i] << 25 ) & 0x10000000 ; //  3 -> 28
335-         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ],
334+         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ],
336335                          sumi); //  SIMD dot product of quantized values
337336
338337        int  vi1 = (vl[i] >>  4 ) & 0x0F0F0F0F ; //  upper 4 qs bits, still need qh as 5th bits
339338        vi1    |= (vh[i] >> 12 ) & 0x00000010 ; //  16 ->  4
340339        vi1    |= (vh[i] >>  5 ) & 0x00001000 ; //  17 -> 12
341340        vi1    |= (vh[i] <<  2 ) & 0x00100000 ; //  18 -> 20
342341        vi1    |= (vh[i] <<  9 ) & 0x10000000 ; //  19 -> 28
343-         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ],
342+         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ],
344343                          sumi); //  SIMD dot product of quantized values
345344    }
346345
@@ -368,15 +367,15 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
368367        vi0    |= (vh[i] << 11 ) & 0x00001000 ; //  1 -> 12
369368        vi0    |= (vh[i] << 18 ) & 0x00100000 ; //  2 -> 20
370369        vi0    |= (vh[i] << 25 ) & 0x10000000 ; //  3 -> 28
371-         sumi = syclcompat ::dp4a (vi0, u[2  * i + 0 ],
370+         sumi = dpct ::dp4a (vi0, u[2  * i + 0 ],
372371                          sumi); //  SIMD dot product of quantized values
373372
374373        int  vi1 = (vl[i] >>  4 ) & 0x0F0F0F0F ; //  upper 4 qs bits, still need qh as 5th bits
375374        vi1    |= (vh[i] >> 12 ) & 0x00000010 ; //  16 ->  4
376375        vi1    |= (vh[i] >>  5 ) & 0x00001000 ; //  17 -> 12
377376        vi1    |= (vh[i] <<  2 ) & 0x00100000 ; //  18 -> 20
378377        vi1    |= (vh[i] <<  9 ) & 0x10000000 ; //  19 -> 28
379-         sumi = syclcompat ::dp4a (vi1, u[2  * i + 1 ],
378+         sumi = dpct ::dp4a (vi1, u[2  * i + 1 ],
380379                          sumi); //  SIMD dot product of quantized values
381380    }
382381
@@ -413,7 +412,7 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
413412#pragma  unroll
414413    for  (int  i = 0 ; i < vdr; ++i) {
415414        //  SIMD dot product of quantized values
416-         sumi = syclcompat ::dp4a (v[i], u[i], sumi);
415+         sumi = dpct ::dp4a (v[i], u[i], sumi);
417416    }
418417
419418    return  d8_0*d8_1 * sumi;
@@ -429,7 +428,7 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
429428#pragma  unroll
430429    for  (int  i = 0 ; i < vdr; ++i) {
431430        //  SIMD dot product of quantized values
432-         sumi = syclcompat ::dp4a (v[i], u[i], sumi);
431+         sumi = dpct ::dp4a (v[i], u[i], sumi);
433432    }
434433
435434#ifdef  GGML_SYCL_F16
@@ -678,10 +677,10 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
678677    const  int  v1 = q4[0 ];
679678    const  int  v2 = q4[4 ];
680679
681-     const  int  dot1 = syclcompat ::dp4a (ui2, v2 & 0x0f0f0f0f , syclcompat ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
682-     const  int  dot2 = syclcompat ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , syclcompat ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
683-     const  int  dot3 = syclcompat ::dp4a (0x01010101 , ui2, syclcompat ::dp4a (0x01010101 , ui1, 0 ));
684-     const  int  dot4 = syclcompat ::dp4a (0x01010101 , ui4, syclcompat ::dp4a (0x01010101 , ui3, 0 ));
680+     const  int  dot1 = dpct ::dp4a (ui2, v2 & 0x0f0f0f0f , dpct ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
681+     const  int  dot2 = dpct ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , dpct ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
682+     const  int  dot3 = dpct ::dp4a (0x01010101 , ui2, dpct ::dp4a (0x01010101 , ui1, 0 ));
683+     const  int  dot4 = dpct ::dp4a (0x01010101 , ui4, dpct ::dp4a (0x01010101 , ui3, 0 ));
685684
686685    sumf_d += d8_1 * (dot1 * s[0 ]) + d8_2 * (dot2 * s[1 ]);
687686    sumf_m += d8_1 * (dot3 * s[2 ]) + d8_2 * (dot4 * s[3 ]);
@@ -773,8 +772,8 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
773772    const  int  v3 = (((vh >> 0 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl1 >> 4 ) & 0x0f0f0f0f );
774773    const  int  v4 = (((vh >> 2 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl2 >> 4 ) & 0x0f0f0f0f );
775774
776-     const  float  sumf_d = d8_1 * (syclcompat ::dp4a (ui1, v1, 0 ) * s[0 ] + syclcompat ::dp4a (ui2, v2, 0 ) * s[1 ])
777-                        + d8_2 * (syclcompat ::dp4a (ui3, v3, 0 ) * s[2 ] + syclcompat ::dp4a (ui4, v4, 0 ) * s[3 ]);
775+     const  float  sumf_d = d8_1 * (dpct ::dp4a (ui1, v1, 0 ) * s[0 ] + dpct ::dp4a (ui2, v2, 0 ) * s[1 ])
776+                        + d8_2 * (dpct ::dp4a (ui3, v3, 0 ) * s[2 ] + dpct ::dp4a (ui4, v4, 0 ) * s[3 ]);
778777
779778    return  d * sumf_d;
780779
@@ -866,8 +865,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
866865            grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
867866        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
868867            grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
869-         sumi1 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
870-         sumi1 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
868+         sumi1 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
869+         sumi1 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
871870        q8 += 8 ;
872871    }
873872    int  sumi2 = 0 ;
@@ -878,8 +877,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
878877            grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
879878        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
880879            grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
881-         sumi2 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
882-         sumi2 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
880+         sumi2 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
881+         sumi2 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
883882        q8 += 8 ;
884883    }
885884    const  float  d = (float )bq2->d  * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -918,8 +917,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
918917            grid[0 ] ^ signs0, signs0, std::minus<>());
919918        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
920919            grid[1 ] ^ signs1, signs1, std::minus<>());
921-         sumi1 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
922-         sumi1 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
920+         sumi1 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi1);
921+         sumi1 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi1);
923922        q8 += 8 ;
924923    }
925924    int  sumi2 = 0 ;
@@ -935,8 +934,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
935934            grid[0 ] ^ signs0, signs0, std::minus<>());
936935        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
937936            grid[1 ] ^ signs1, signs1, std::minus<>());
938-         sumi2 = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
939-         sumi2 = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
937+         sumi2 = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi2);
938+         sumi2 = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi2);
940939        q8 += 8 ;
941940    }
942941    const  float  d = (float )bq2->d  * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -969,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
969968            grid1[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
970969        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
971970            grid2[0 ] ^ signs[1 ], signs[1 ], std::minus<>());
972-         sumi = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi);
973-         sumi = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi);
971+         sumi = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi);
972+         sumi = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi);
974973        q8 += 8 ;
975974        aux32 >>= 7 ;
976975    }
@@ -1010,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
10101009            grid1[0 ] ^ signs0, signs0, std::minus<>());
10111010        const  int  grid_h = dpct::vectorized_binary<sycl::uchar4>(
10121011            grid2[0 ] ^ signs1, signs1, std::minus<>());
1013-         sumi = syclcompat ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi);
1014-         sumi = syclcompat ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi);
1012+         sumi = dpct ::dp4a (grid_l, *((const  int  *)q8 + 0 ), sumi);
1013+         sumi = dpct ::dp4a (grid_h, *((const  int  *)q8 + 1 ), sumi);
10151014        q8 += 8 ;
10161015    }
10171016    const  float  d =
@@ -1038,8 +1037,8 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
10381037        const  int  * grid = (const  int  *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [ib32] >> 3 *l) & 7 ) << 8 )));
10391038        int  grid0 = grid[0 ] & 0x0f0f0f0f ;
10401039        int  grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1041-         sumi = syclcompat ::dp4a (q8[2  * l + 1 ], grid1,
1042-                           syclcompat ::dp4a (q8[2  * l + 0 ], grid0, sumi));
1040+         sumi = dpct ::dp4a (q8[2  * l + 1 ], grid1,
1041+                           dpct ::dp4a (q8[2  * l + 0 ], grid0, sumi));
10431042    }
10441043
10451044    const  float  delta = bq1->qh [ib32] & 0x8000  ? -1 -IQ1S_DELTA : -1 +IQ1S_DELTA;
@@ -1067,11 +1066,11 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
10671066        const  int  * grid = (const  int  *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 7 ) << 8 )));
10681067        int  grid0 = grid[0 ] & 0x0f0f0f0f ;
10691068        int  grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1070-         sumi[l / 2 ] = syclcompat ::dp4a (q8[2  * l + 1 ], grid1,
1071-                                  syclcompat ::dp4a (q8[2  * l + 0 ], grid0, sumi[l / 2 ]));
1069+         sumi[l / 2 ] = dpct ::dp4a (q8[2  * l + 1 ], grid1,
1070+                                  dpct ::dp4a (q8[2  * l + 0 ], grid0, sumi[l / 2 ]));
10721071        const  float  delta = (bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 0x08  ? -1 -IQ1M_DELTA : -1 +IQ1M_DELTA;
1073-         const  int  sumy = syclcompat ::dp4a (q8[2  * l + 1 ], 0x01010101 ,
1074-                                     syclcompat ::dp4a (q8[2  * l + 0 ], 0x01010101 , 0 ));
1072+         const  int  sumy = dpct ::dp4a (q8[2  * l + 1 ], 0x01010101 ,
1073+                                     dpct ::dp4a (q8[2  * l + 0 ], 0x01010101 , 0 ));
10751074        sumf[l/2 ] += delta*sumy;
10761075    }
10771076
@@ -1102,8 +1101,8 @@ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
11021101    for  (int  l = 0 ; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
11031102        const  uint32_t  aux = q4[2 *l] | (q4[2 *l+1 ] << 16 );
11041103        get_int_from_table_16 (aux, values, v1, v2);
1105-         sumi1 = syclcompat ::dp4a (v1, q8[l + 0 ], sumi1);
1106-         sumi2 = syclcompat ::dp4a (v2, q8[l + 4 ], sumi2);
1104+         sumi1 = dpct ::dp4a (v1, q8[l + 0 ], sumi1);
1105+         sumi2 = dpct ::dp4a (v2, q8[l + 4 ], sumi2);
11071106    }
11081107
11091108    const  float  d = (float )bq->d  * bq8_1->ds [0 ];
@@ -1129,8 +1128,8 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
11291128    int  sumi1 = 0 , sumi2 = 0 ;
11301129    for  (int  j = 0 ; j < 4 ; ++j) {
11311130        get_int_from_table_16 (q4[j], values, v1, v2);
1132-         sumi1 = syclcompat ::dp4a (v1, q8[j + 0 ], sumi1);
1133-         sumi2 = syclcompat ::dp4a (v2, q8[j + 4 ], sumi2);
1131+         sumi1 = dpct ::dp4a (v1, q8[j + 0 ], sumi1);
1132+         sumi2 = dpct ::dp4a (v2, q8[j + 4 ], sumi2);
11341133    }
11351134    return  d * (sumi1 + sumi2);
11361135#else 
0 commit comments