From ac60debed9e1750a80d3da209337a6d8aa3bbc11 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:08:17 +0800 Subject: [PATCH 1/6] Update DilatedMaxPool2d.cpp --- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 295 +++++++++++++++++- 1 file changed, 292 insertions(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index d94db11c93..b2321ab9fc 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -251,30 +252,33 @@ struct MaxPool2dBackwardDeterministicKernelFunctor { int pwstart = p_start(inputW, pad_w_, kernel_w_, dilation_w_, stride_w_); int pwend = p_end(inputW, pad_w_, gradOutputSizeW_, stride_w_); + scalar_t grad = 0; if constexpr (is_channels_last) { int offset = batch * out_n_stride_ + plane; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { if (indices_[offset + (ph * gradOutputSizeW_ + pw) * numPlane_] == input_hw_index) { - gradInput_[inputIndex] += static_cast( + grad += static_cast( gradOutput_ [offset + (ph * gradOutputSizeW_ + pw) * numPlane_]); } } } - } else { + } + else { int offset = batch * out_n_stride_ + plane * out_cf_c_stride_; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { if (indices_[offset + ph * gradOutputSizeW_ + pw] == input_hw_index) { - gradInput_[inputIndex] += static_cast( + grad += static_cast( gradOutput_[offset + ph * gradOutputSizeW_ + pw]); } } } } + gradInput_[inputIndex] = grad; } } while (cfg_.next(item, desc)); } @@ -349,6 +353,122 @@ struct MaxPool2dBackwardDeterministicKernelFunctor { BatchKernelConfig cfg_; }; +template +struct MaxPool2dBackwardChannelLastVec { + void operator()(sycl::nd_item<1> item) const { + for (auto inputIndex = item.get_global_linear_id(); + inputIndex < gradInputSize_ / vec_size; + inputIndex += item.get_local_range(0) * item.get_group_range(0)) { + int batch = inputIndex / (in_n_stride_ / vec_size); + int plane; + int64_t input_hw_index; + + plane = inputIndex % (numPlane_ / vec_size); + input_hw_index = + ((inputIndex % in_n_stride_) - plane) / (numPlane_ / vec_size); + + int inputW = input_hw_index % gradInputSizeW_; + int inputH = input_hw_index / gradInputSizeW_; + int phstart = p_start(inputH, pad_h_, kernel_h_, dilation_h_, stride_h_); + int phend = p_end(inputH, pad_h_, gradOutputSizeH_, stride_h_); + int pwstart = p_start(inputW, pad_w_, kernel_w_, dilation_w_, stride_w_); + int pwend = p_end(inputW, pad_w_, gradOutputSizeW_, stride_w_); + scalar_t grad = 0; + int64_t load_offset, store_offset; + store_offset = inputIndex; + vec_t grad_vec; +#pragma unroll + for (int i = 0; i < vec_size; i++) { + grad_vec[i] = 0; + } + + int offset = batch * (out_n_stride_ / vec_size) + plane; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + load_offset = + offset + (ph * gradOutputSizeW_ + pw) * (numPlane_ / vec_size); + vec_t gout_val_vec = gradOutput_[load_offset]; +#pragma unroll + for (int i = 0; i < vec_size; i++) { + if (indices_[load_offset * vec_size + i] == input_hw_index) { + grad_vec[i] = static_cast(grad_vec[i]) + + static_cast(gout_val_vec[i]); + } + } + } + } + + gradInput_[store_offset] = grad_vec; + } + } + MaxPool2dBackwardChannelLastVec( + vec_t* gradInput, + const vec_t* gradOutput, + const int64_t* indices, + int numPlane, 
+ int gradInputSizeH, + int gradInputSizeW, + int gradOutputSizeH, + int gradOutputSizeW, + int64_t gradInputSize, + int out_cf_c_stride, + int in_cf_c_stride, + int out_n_stride, + int in_n_stride, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w) + : gradInput_(gradInput), + gradOutput_(gradOutput), + indices_(indices), + numPlane_(numPlane), + gradInputSizeH_(gradInputSizeH), + gradInputSizeW_(gradInputSizeW), + gradOutputSizeH_(gradOutputSizeH), + gradOutputSizeW_(gradOutputSizeW), + gradInputSize_(gradInputSize), + out_cf_c_stride_(out_cf_c_stride), + in_cf_c_stride_(in_cf_c_stride), + out_n_stride_(out_n_stride), + in_n_stride_(in_n_stride), + kernel_h_(kernel_h), + kernel_w_(kernel_w), + stride_h_(stride_h), + stride_w_(stride_w), + pad_h_(pad_h), + pad_w_(pad_w), + dilation_h_(dilation_h), + dilation_w_(dilation_w) {} + + private: + vec_t* gradInput_; + const vec_t* gradOutput_; + const int64_t* indices_; + int numPlane_; + int gradInputSizeH_; + int gradInputSizeW_; + int gradOutputSizeH_; + int gradOutputSizeW_; + int64_t gradInputSize_; + int out_cf_c_stride_; + int in_cf_c_stride_; + int out_n_stride_; + int in_n_stride_; + int kernel_h_; + int kernel_w_; + int stride_h_; + int stride_w_; + int pad_h_; + int pad_w_; + int dilation_h_; + int dilation_w_; +}; + template void launch_max_pool2d_kernel( scalar_t* output, @@ -397,6 +517,62 @@ void launch_max_pool2d_kernel( sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn); } +#define LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( \ + scalar_t, \ + vec_size, \ + num_wg, \ + wg_size, \ + queue, \ + gradInput, \ + gradOutput, \ + indices, \ + numPlane, \ + gradInputSizeH, \ + gradInputSizeW, \ + gradOutputSizeH, \ + gradOutputSizeW, \ + gradInputSize, \ + out_cf_c_stride, \ + in_cf_c_stride, \ + out_n_stride, \ + in_n_stride, \ + kernel_h, \ + kernel_w, \ + stride_h, \ + stride_w, \ + pad_h, \ + pad_w, \ + dilation_h, \ + dilation_w) \ + { \ + using vec_t = memory::aligned_vector; \ + const vec_t* grad_output_vec = reinterpret_cast(gradOutput); \ + vec_t* grad_input_vec = reinterpret_cast(gradInput); \ + auto kfn = MaxPool2dBackwardChannelLastVec( \ + grad_input_vec, \ + grad_output_vec, \ + indices, \ + numPlane, \ + gradInputSizeH, \ + gradInputSizeW, \ + gradOutputSizeH, \ + gradOutputSizeW, \ + gradInputSize, \ + out_cf_c_stride, \ + in_cf_c_stride, \ + out_n_stride, \ + in_n_stride, \ + kernel_h, \ + kernel_w, \ + stride_h, \ + stride_w, \ + pad_h, \ + pad_w, \ + dilation_h, \ + dilation_w); \ + sycl_kernel_submit(num_wg* wg_size, wg_size, queue, kfn); \ + } + template void launch_max_pool2d_backward_kernel( scalar_t* gradInput, @@ -435,6 +611,119 @@ void launch_max_pool2d_backward_kernel( // with CUDA in alexnet To avoid future problem, we decided to always use // deterministic path. 
+ + // int vec_size = 1; + // int thread_slots = syclGpuEuCount() * syclGpuHWThreadsPerEU(); + // int num_sub_wg; + // auto wg_size = syclDeviceMaxWorkGroupSize(); + // int64_t num_wg; + // if constexpr (is_channels_last) { + // for (vec_size = std::min( + // 8, memory::can_vectorize_up_to((char*)gradOutput)); + // vec_size >= 1; + // vec_size /= 2) { + // if (numPlane % vec_size != 0) { + // continue; + // } + // num_sub_wg = gradInputSize / vec_size / syclMaxSubGroupSize(); + // if (2 * num_sub_wg > thread_slots) { + // int total_thread = gradInputSize / vec_size; + // num_wg = (total_thread + wg_size - 1) / wg_size; + // break; + // } + // } + // switch (vec_size) { + // case 8: + // LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( + // scalar_t, + // 8, + // num_wg, + // wg_size, + // queue, + // gradInput, + // gradOutput, + // indices, + // numPlane, + // gradInputSizeH, + // gradInputSizeW, + // gradOutputSizeH, + // gradOutputSizeW, + // gradInputSize, + // out_cf_c_stride, + // in_cf_c_stride, + // out_n_stride, + // in_n_stride, + // kernel_h, + // kernel_w, + // stride_h, + // stride_w, + // pad_h, + // pad_w, + // dilation_h, + // dilation_w); + // return; + // case 4: + // LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( + // scalar_t, + // 1, + // num_wg, + // wg_size, + // queue, + // gradInput, + // gradOutput, + // indices, + // numPlane, + // gradInputSizeH, + // gradInputSizeW, + // gradOutputSizeH, + // gradOutputSizeW, + // gradInputSize, + // out_cf_c_stride, + // in_cf_c_stride, + // out_n_stride, + // in_n_stride, + // kernel_h, + // kernel_w, + // stride_h, + // stride_w, + // pad_h, + // pad_w, + // dilation_h, + // dilation_w); + // return; + // case 2: + // LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( + // scalar_t, + // 2, + // num_wg, + // wg_size, + // queue, + // gradInput, + // gradOutput, + // indices, + // numPlane, + // gradInputSizeH, + // gradInputSizeW, + // gradOutputSizeH, + // gradOutputSizeW, + // gradInputSize, + // out_cf_c_stride, + // in_cf_c_stride, + // out_n_stride, + // in_n_stride, + // kernel_h, + // kernel_w, + // stride_h, + // stride_w, + // pad_h, + // pad_w, + // dilation_h, + // dilation_w); + // return; + // default: + // break; + // }; + // } using KernelClass = MaxPool2dBackwardDeterministicKernelFunctor; BatchKernelConfig cfg = BatchKernelConfig::make_config( From 76a5583d55d2355128e0fb53b6bef066d4cc9a7f Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:11:11 +0800 Subject: [PATCH 2/6] Update DilatedMaxPool2d.cpp --- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 224 +++++++++--------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index b2321ab9fc..7a0b1f28f0 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -612,118 +612,118 @@ void launch_max_pool2d_backward_kernel( // deterministic path. 
- // int vec_size = 1; - // int thread_slots = syclGpuEuCount() * syclGpuHWThreadsPerEU(); - // int num_sub_wg; - // auto wg_size = syclDeviceMaxWorkGroupSize(); - // int64_t num_wg; - // if constexpr (is_channels_last) { - // for (vec_size = std::min( - // 8, memory::can_vectorize_up_to((char*)gradOutput)); - // vec_size >= 1; - // vec_size /= 2) { - // if (numPlane % vec_size != 0) { - // continue; - // } - // num_sub_wg = gradInputSize / vec_size / syclMaxSubGroupSize(); - // if (2 * num_sub_wg > thread_slots) { - // int total_thread = gradInputSize / vec_size; - // num_wg = (total_thread + wg_size - 1) / wg_size; - // break; - // } - // } - // switch (vec_size) { - // case 8: - // LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( - // scalar_t, - // 8, - // num_wg, - // wg_size, - // queue, - // gradInput, - // gradOutput, - // indices, - // numPlane, - // gradInputSizeH, - // gradInputSizeW, - // gradOutputSizeH, - // gradOutputSizeW, - // gradInputSize, - // out_cf_c_stride, - // in_cf_c_stride, - // out_n_stride, - // in_n_stride, - // kernel_h, - // kernel_w, - // stride_h, - // stride_w, - // pad_h, - // pad_w, - // dilation_h, - // dilation_w); - // return; - // case 4: - // LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( - // scalar_t, - // 1, - // num_wg, - // wg_size, - // queue, - // gradInput, - // gradOutput, - // indices, - // numPlane, - // gradInputSizeH, - // gradInputSizeW, - // gradOutputSizeH, - // gradOutputSizeW, - // gradInputSize, - // out_cf_c_stride, - // in_cf_c_stride, - // out_n_stride, - // in_n_stride, - // kernel_h, - // kernel_w, - // stride_h, - // stride_w, - // pad_h, - // pad_w, - // dilation_h, - // dilation_w); - // return; - // case 2: - // LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( - // scalar_t, - // 2, - // num_wg, - // wg_size, - // queue, - // gradInput, - // gradOutput, - // indices, - // numPlane, - // gradInputSizeH, - // gradInputSizeW, - // gradOutputSizeH, - // gradOutputSizeW, - // gradInputSize, - // out_cf_c_stride, - // in_cf_c_stride, - // out_n_stride, - // in_n_stride, - // kernel_h, - // kernel_w, - // stride_h, - // stride_w, - // pad_h, - // pad_w, - // dilation_h, - // dilation_w); - // return; - // default: - // break; - // }; - // } + int vec_size = 1; + int thread_slots = syclGpuEuCount() * syclGpuHWThreadsPerEU(); + int num_sub_wg; + auto wg_size = syclDeviceMaxWorkGroupSize(); + int64_t num_wg; + if constexpr (is_channels_last) { + for (vec_size = std::min( + 8, memory::can_vectorize_up_to((char*)gradOutput)); + vec_size >= 1; + vec_size /= 2) { + if (numPlane % vec_size != 0) { + continue; + } + num_sub_wg = gradInputSize / vec_size / syclMaxSubGroupSize(); + if (2 * num_sub_wg > thread_slots) { + int total_thread = gradInputSize / vec_size; + num_wg = (total_thread + wg_size - 1) / wg_size; + break; + } + } + switch (vec_size) { + case 8: + LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( + scalar_t, + 8, + num_wg, + wg_size, + queue, + gradInput, + gradOutput, + indices, + numPlane, + gradInputSizeH, + gradInputSizeW, + gradOutputSizeH, + gradOutputSizeW, + gradInputSize, + out_cf_c_stride, + in_cf_c_stride, + out_n_stride, + in_n_stride, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w); + return; + case 4: + LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( + scalar_t, + 1, + num_wg, + wg_size, + queue, + gradInput, + gradOutput, + indices, + numPlane, + gradInputSizeH, + gradInputSizeW, + gradOutputSizeH, + gradOutputSizeW, + gradInputSize, + out_cf_c_stride, + in_cf_c_stride, + out_n_stride, + 
in_n_stride, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w); + return; + case 2: + LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( + scalar_t, + 2, + num_wg, + wg_size, + queue, + gradInput, + gradOutput, + indices, + numPlane, + gradInputSizeH, + gradInputSizeW, + gradOutputSizeH, + gradOutputSizeW, + gradInputSize, + out_cf_c_stride, + in_cf_c_stride, + out_n_stride, + in_n_stride, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w); + return; + default: + break; + }; + } using KernelClass = MaxPool2dBackwardDeterministicKernelFunctor; BatchKernelConfig cfg = BatchKernelConfig::make_config( From 18eb9341f5bf0cb6f6b4b77a9ceaf72be2965808 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:11:30 +0800 Subject: [PATCH 3/6] Update src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 7a0b1f28f0..ec732d8b71 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -265,8 +265,7 @@ struct MaxPool2dBackwardDeterministicKernelFunctor { } } } - } - else { + } else { int offset = batch * out_n_stride_ + plane * out_cf_c_stride_; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { From cefb88a01161090993a2ba9a4dc07d5a71506f3f Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:15:06 +0800 Subject: [PATCH 4/6] remove unnecessary var Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index ec732d8b71..3c87387c02 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -372,7 +372,6 @@ struct MaxPool2dBackwardChannelLastVec { int phend = p_end(inputH, pad_h_, gradOutputSizeH_, stride_h_); int pwstart = p_start(inputW, pad_w_, kernel_w_, dilation_w_, stride_w_); int pwend = p_end(inputW, pad_w_, gradOutputSizeW_, stride_w_); - scalar_t grad = 0; int64_t load_offset, store_offset; store_offset = inputIndex; vec_t grad_vec; From 25d2766f1794c4fc401d7b13398973b6c72061b3 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:17:05 +0800 Subject: [PATCH 5/6] fix --- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 3c87387c02..5e8a92f7e7 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -663,7 +663,7 @@ void launch_max_pool2d_backward_kernel( case 4: LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( scalar_t, - 1, + 4, num_wg, wg_size, queue, From 357beb40e7fbd0d571a062332e7a599c8b7c7aff Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Sun, 10 Aug 2025 22:32:26 +0800 Subject: [PATCH 6/6] fix --- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 
37 +++++-------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 5e8a92f7e7..a121eb7be4 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -363,8 +363,9 @@ struct MaxPool2dBackwardChannelLastVec { int64_t input_hw_index; plane = inputIndex % (numPlane_ / vec_size); - input_hw_index = - ((inputIndex % in_n_stride_) - plane) / (numPlane_ / vec_size); + + input_hw_index = ((inputIndex % (in_n_stride_ / vec_size)) - plane) / + (numPlane_ / vec_size); int inputW = input_hw_index % gradInputSizeW_; int inputH = input_hw_index / gradInputSizeW_; @@ -372,19 +373,18 @@ struct MaxPool2dBackwardChannelLastVec { int phend = p_end(inputH, pad_h_, gradOutputSizeH_, stride_h_); int pwstart = p_start(inputW, pad_w_, kernel_w_, dilation_w_, stride_w_); int pwend = p_end(inputW, pad_w_, gradOutputSizeW_, stride_w_); - int64_t load_offset, store_offset; - store_offset = inputIndex; vec_t grad_vec; #pragma unroll for (int i = 0; i < vec_size; i++) { grad_vec[i] = 0; } - int offset = batch * (out_n_stride_ / vec_size) + plane; + int offset = batch * out_n_stride_ / vec_size + plane; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - load_offset = - offset + (ph * gradOutputSizeW_ + pw) * (numPlane_ / vec_size); + int load_offset = offset + + ph * gradOutputSizeW_ * numPlane_ / vec_size + + pw * numPlane_ / vec_size; vec_t gout_val_vec = gradOutput_[load_offset]; #pragma unroll for (int i = 0; i < vec_size; i++) { @@ -396,7 +396,7 @@ struct MaxPool2dBackwardChannelLastVec { } } - gradInput_[store_offset] = grad_vec; + gradInput_[inputIndex] = grad_vec; } } MaxPool2dBackwardChannelLastVec( @@ -409,8 +409,6 @@ struct MaxPool2dBackwardChannelLastVec { int gradOutputSizeH, int gradOutputSizeW, int64_t gradInputSize, - int out_cf_c_stride, - int in_cf_c_stride, int out_n_stride, int in_n_stride, int kernel_h, @@ -430,8 +428,6 @@ struct MaxPool2dBackwardChannelLastVec { gradOutputSizeH_(gradOutputSizeH), gradOutputSizeW_(gradOutputSizeW), gradInputSize_(gradInputSize), - out_cf_c_stride_(out_cf_c_stride), - in_cf_c_stride_(in_cf_c_stride), out_n_stride_(out_n_stride), in_n_stride_(in_n_stride), kernel_h_(kernel_h), @@ -453,8 +449,6 @@ struct MaxPool2dBackwardChannelLastVec { int gradOutputSizeH_; int gradOutputSizeW_; int64_t gradInputSize_; - int out_cf_c_stride_; - int in_cf_c_stride_; int out_n_stride_; int in_n_stride_; int kernel_h_; @@ -467,6 +461,7 @@ struct MaxPool2dBackwardChannelLastVec { int dilation_w_; }; + template void launch_max_pool2d_kernel( scalar_t* output, @@ -530,8 +525,6 @@ void launch_max_pool2d_kernel( gradOutputSizeH, \ gradOutputSizeW, \ gradInputSize, \ - out_cf_c_stride, \ - in_cf_c_stride, \ out_n_stride, \ in_n_stride, \ kernel_h, \ @@ -556,8 +549,6 @@ void launch_max_pool2d_kernel( gradOutputSizeH, \ gradOutputSizeW, \ gradInputSize, \ - out_cf_c_stride, \ - in_cf_c_stride, \ out_n_stride, \ in_n_stride, \ kernel_h, \ @@ -609,7 +600,6 @@ void launch_max_pool2d_backward_kernel( // with CUDA in alexnet To avoid future problem, we decided to always use // deterministic path. 
- int vec_size = 1; int thread_slots = syclGpuEuCount() * syclGpuHWThreadsPerEU(); int num_sub_wg; @@ -630,7 +620,7 @@ void launch_max_pool2d_backward_kernel( break; } } - switch (vec_size) { + switch (vec_size) { case 8: LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC( scalar_t, @@ -647,8 +637,6 @@ void launch_max_pool2d_backward_kernel( gradOutputSizeH, gradOutputSizeW, gradInputSize, - out_cf_c_stride, - in_cf_c_stride, out_n_stride, in_n_stride, kernel_h, @@ -676,8 +664,6 @@ void launch_max_pool2d_backward_kernel( gradOutputSizeH, gradOutputSizeW, gradInputSize, - out_cf_c_stride, - in_cf_c_stride, out_n_stride, in_n_stride, kernel_h, @@ -705,8 +691,6 @@ void launch_max_pool2d_backward_kernel( gradOutputSizeH, gradOutputSizeW, gradInputSize, - out_cf_c_stride, - in_cf_c_stride, out_n_stride, in_n_stride, kernel_h, @@ -934,7 +918,6 @@ void max_pool2d_with_indices_backward_kernel( inputHeight, kH, padH, dH, dilationH, ceil_mode); int64_t outputWidth = pooling_output_shape( inputWidth, kW, padW, dW, dilationW, ceil_mode); - AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16,
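
The series above adds a vectorized channels-last path to the max_pool2d backward pass: with an NHWC layout the channel dimension is contiguous, so each work-item can load and store vec_size adjacent channels of gradOutput/gradInput at once, and the launcher only takes this path when the vectorized grid still keeps the device occupied; otherwise it falls back to the scalar MaxPool2dBackwardDeterministicKernelFunctor. A minimal standalone sketch of the selection heuristic introduced (commented out) in patch 1 and enabled in patch 2 follows. It is illustrative only: thread_slots, sub_group_size, wg_size and max_vec_elems stand in for the syclGpuEuCount() * syclGpuHWThreadsPerEU(), syclMaxSubGroupSize(), syclDeviceMaxWorkGroupSize() and memory::can_vectorize_up_to() values used in the patch, and VecPlan / pick_vec_size are hypothetical names.

#include <algorithm>
#include <cstdint>

struct VecPlan {
  int vec_size;    // 0 means: fall back to the scalar deterministic kernel
  int64_t num_wg;  // number of work-groups for the vectorized kernel
};

// Pick the widest power-of-two vector width (<= 8) that divides the channel
// count and still leaves enough sub-groups to occupy the device.
VecPlan pick_vec_size(
    int64_t grad_input_size,  // total number of gradInput elements
    int num_plane,            // channel count (innermost dimension in NHWC)
    int max_vec_elems,        // widest vector the pointer alignment allows
    int thread_slots,         // EU count * HW threads per EU
    int sub_group_size,
    int wg_size) {
  for (int vec = std::min(8, max_vec_elems); vec >= 1; vec /= 2) {
    // Each work-item handles `vec` consecutive channels, so the channel
    // count must be divisible by the vector width.
    if (num_plane % vec != 0) {
      continue;
    }
    // Sub-groups of work remaining after vectorization.
    int64_t num_sub_wg = grad_input_size / vec / sub_group_size;
    // Vectorize only while the device stays well oversubscribed.
    if (2 * num_sub_wg > thread_slots) {
      int64_t total_threads = grad_input_size / vec;
      return {vec, (total_threads + wg_size - 1) / wg_size};
    }
  }
  return {0, 0};  // too little work per the heuristic: use the scalar path
}

A vec_size of 0 here corresponds to the default: branch of the switch in the patch, which drops through to the deterministic kernel; the non-zero widths map to the 8/4/2 cases that instantiate MaxPool2dBackwardChannelLastVec through LAUNCH_MAXPOOL_BACKWARD_CHANNEL_LAST_VEC.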