@@ -16,7 +16,8 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     x += sample * stride_sample + channel * stride_channel + row * stride_row;
     dst += ((sample * nchannels + channel) * nrows + row) * ncols;
 
-    sycl::float2 mean_var{0.f, 0.f};
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
     for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[col];
         mean_var.x() += xi;
@@ -42,7 +43,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
     }
 
     const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
+    const float var = mean_var.y() / ncols - mean * mean;
     const float inv_std = sycl::rsqrt(var + eps);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -240,7 +241,7 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
     const sycl::range<3> global_dims(nsamples, nchannels, nrows);
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE); // Equivalent to CUDA's (WARP_SIZE, 1, 1)
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(global_dims * block_dims, block_dims),
@@ -260,8 +261,8 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         stream->submit([&](sycl::handler& cgh) {
-            auto s_sum_acc_ct1 = sycl::local_accessor<sycl::float2, 1>(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(work_group_size / WARP_SIZE), cgh);
             cgh.parallel_for(
                 sycl::nd_range<3>(global_dims * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
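As context for the first two hunks: the kernel accumulates sum(x) in mean_var.x() and sum(x^2) in mean_var.y(), then derives the variance with the single-pass identity Var(x) = E[x^2] - (E[x])^2. A minimal host-side sketch of that computation follows (illustrative only, not part of the patch; the helper name mean_var_one_pass is hypothetical):

    #include <cstddef>

    // Single-pass mean/variance, mirroring the accumulation in norm_f32:
    // one loop gathers sum(x) and sum(x^2), then var = E[x^2] - (E[x])^2.
    static void mean_var_one_pass(const float * x, std::size_t n, float & mean, float & var) {
        float sum = 0.f, sum_sq = 0.f;
        for (std::size_t i = 0; i < n; ++i) {
            sum    += x[i];
            sum_sq += x[i] * x[i];
        }
        mean = sum / (float) n;
        var  = sum_sq / (float) n - mean * mean; // E[x^2] - (E[x])^2
    }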