@@ -254,8 +254,8 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
254254 GGML_ASSERT (ncols % WARP_SIZE == 0 );
255255 if (ncols < 1024 ) {
256256 const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
257- stream-> submit ([&](sycl::handler& cgh) {
258- cgh. parallel_for (
257+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
258+ syclex::nd_launch (cgh,
259259 sycl::nd_range<3 >(global_dims * block_dims, block_dims),
260260 [=](sycl::nd_item<3 > item_ct1)
261261 [[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -272,10 +272,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
272272 the limit. To get the device limit, query
273273 info::device::max_work_group_size. Adjust the work-group size if needed.
274274 */
275- stream-> submit ([&](sycl::handler& cgh) {
275+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
276276 sycl::local_accessor<sycl::float2, 1 > s_sum_acc_ct1 (
277277 sycl::range<1 >(work_group_size / WARP_SIZE), cgh);
278- cgh. parallel_for (
278+ syclex::nd_launch (cgh,
279279 sycl::nd_range<3 >(global_dims * block_dims, block_dims),
280280 [=](sycl::nd_item<3 > item_ct1)
281281 [[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -290,9 +290,9 @@ static void group_norm_f32_sycl(const float* x, float* dst,
290290 const int ne_elements, queue_ptr stream, int device) {
291291 if (group_size < 1024 ) {
292292 const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
293- stream-> submit ([&](sycl::handler& cgh) {
293+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
294294 const float eps_ct4 = eps;
295- cgh. parallel_for (
295+ syclex::nd_launch (cgh,
296296 sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , num_groups) * block_dims,
297297 block_dims),
298298 [=](sycl::nd_item<3 > item_ct1)
@@ -313,13 +313,13 @@ static void group_norm_f32_sycl(const float* x, float* dst,
313313 info::device::max_work_group_size. Adjust the work-group size if needed.
314314 */
315315
316- stream-> submit ([&](sycl::handler& cgh) {
316+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
317317 sycl::local_accessor<float , 1 > s_sum_acc_ct1 (sycl::range<1 >(work_group_size / WARP_SIZE),
318318 cgh);
319319
320320 const float eps_ct4 = eps;
321321
322- cgh. parallel_for (
322+ syclex::nd_launch (cgh,
323323 sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , num_groups) * block_dims,
324324 block_dims),
325325 [=](sycl::nd_item<3 > item_ct1)
@@ -340,8 +340,8 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
340340 const sycl::range<3 > global_dims (nsamples, nchannels, nrows);
341341 if (ncols < 1024 ) {
342342 const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
343- stream-> submit ([&](sycl::handler& cgh) {
344- cgh. parallel_for (
343+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
344+ syclex::nd_launch (cgh,
345345 sycl::nd_range<3 >(global_dims * block_dims, block_dims),
346346 [=](sycl::nd_item<3 > item_ct1)
347347 [[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -358,10 +358,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
358358 the limit. To get the device limit, query
359359 info::device::max_work_group_size. Adjust the work-group size if needed.
360360 */
361- stream-> submit ([&](sycl::handler& cgh) {
361+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
362362 sycl::local_accessor<float , 1 > s_sum_acc_ct1 (sycl::range<1 >(work_group_size / WARP_SIZE),
363363 cgh);
364- cgh. parallel_for (
364+ syclex::nd_launch (cgh,
365365 sycl::nd_range<3 >(global_dims * block_dims, block_dims),
366366 [=](sycl::nd_item<3 > item_ct1)
367367 [[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -378,8 +378,8 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
378378 // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
379379 if (ncols < 1024 ) {
380380 const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
381- stream-> submit ([&](sycl::handler& cgh) {
382- cgh. parallel_for (
381+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
382+ syclex::nd_launch (cgh,
383383 sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , nrows) * block_dims,
384384 block_dims),
385385 [=](sycl::nd_item<3 > item_ct1)
@@ -398,10 +398,10 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
398398 the limit. To get the device limit, query
399399 info::device::max_work_group_size. Adjust the work-group size if needed.
400400 */
401- stream-> submit ([&](sycl::handler& cgh) {
401+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
402402 sycl::local_accessor<float , 1 > s_sum_acc_ct1 (sycl::range<1 >(work_group_size / WARP_SIZE),
403403 cgh);
404- cgh. parallel_for (
404+ syclex::nd_launch (cgh,
405405 sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , nrows) * block_dims,
406406 block_dims),
407407 [=](sycl::nd_item<3 > item_ct1)
0 commit comments